# Libraries
# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot,
cobalt, MatchIt, Zelig, estimatr, cem, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
library(fixest)
##
## Attaching package: 'fixest'
## The following object is masked from 'package:scales':
##
## pvalue
## The following object is masked from 'package:Jmisc':
##
## demean
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
library(causalweight)
## Loading required package: ranger
library(fixest)
install.packages('devtools')
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
devtools::install_github('synth-inference/synthdid', ref='sdid-paper')
## Using GitHub PAT from the git credential store.
## Skipping install of 'synthdid' from a github remote, the SHA1 (36923192) has not changed since last install.
## Use `force = TRUE` to force installation
install.packages(c('dplyr', 'tidyr', 'tibble', 'ggplot2', 'devtools', 'xtable'))
## Warning: packages 'dplyr', 'tidyr', 'tibble', 'ggplot2' are in use and will not
## be installed
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
## package 'xtable' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
devtools::install_github('cran/glmnet', ref='f4fc95ab49efaad9b6e1728a7c840bc6159501dc')
## Using GitHub PAT from the git credential store.
## Skipping install of 'glmnet' from a github remote, the SHA1 (f4fc95ab) has not changed since last install.
## Use `force = TRUE` to force installation
devtools::install_github('susanathey/MCPanel', ref='6b2706fd7c35f3266048ceb22a7e9a61ae1774da')
## Using GitHub PAT from the git credential store.
## Skipping install of 'MCPanel' from a github remote, the SHA1 (6b2706fd) has not changed since last install.
## Use `force = TRUE` to force installation
library(synthdid)
data('california_prop99')
setup = panel.matrices(california_prop99)
tau.hat = synthdid_estimate(setup$Y, setup$N0, setup$T0)
tau.hat
## synthdid: -15.604 +- NA. Effective N0/N0 = 16.4/38~0.4. Effective T0/T0 = 2.8/19~0.1. N1,T1 = 1,12.
print(summary(tau.hat))
## $estimate
## [1] -15.60383
##
## $se
## [,1]
## [1,] NA
##
## $controls
## estimate 1
## Nevada 0.124
## New Hampshire 0.105
## Connecticut 0.078
## Delaware 0.070
## Colorado 0.058
## Illinois 0.053
## Nebraska 0.048
## Montana 0.045
## Utah 0.042
## New Mexico 0.041
## Minnesota 0.039
## Wisconsin 0.037
## West Virginia 0.034
## North Carolina 0.033
## Idaho 0.031
## Ohio 0.031
## Maine 0.028
## Iowa 0.026
##
## $periods
## estimate 1
## 1988 0.427
## 1986 0.366
## 1987 0.206
##
## $dimensions
## N1 N0 N0.effective T1 T0 T0.effective
## 1.000 38.000 16.388 12.000 19.000 2.783
se = sqrt(vcov(tau.hat, method='placebo'))
sprintf('point estimate: %1.2f', tau.hat)
## [1] "point estimate: -15.60"
sprintf('95%% CI (%1.2f, %1.2f)', tau.hat - 1.96 * se, tau.hat + 1.96 * se)
## [1] "95% CI (-35.41, 4.20)"
plot(tau.hat, se.method='placebo')
library(parallel)
library(doSNOW)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## closeNode, clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
## parCapply, parLapply, parRapply, parSapply, recvData, recvOneData,
## sendData, splitIndices, stopCluster
library(foreach)
# -----------------------------------------
# Set up parallel backend (you can turn this off with paraprocess = 0)
# -----------------------------------------
paraprocess <- 1
if (paraprocess == 1) {
availcores <- detectCores()
numcores <- max(1, floor(availcores / 1.2))
cl <- makeCluster(numcores, type = "SOCK")
registerDoSNOW(cl)
clusterSetRNGStream(cl, 12345) # Ensures reproducibility
}
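For completeness, the backend should be released once the parallel runs are finished; a minimal cleanup sketch (intended for the end of the session, not run here), assuming cl is still in scope:
# Shut down the SOCK cluster and fall back to sequential execution
if (paraprocess == 1) {
  stopCluster(cl)
  registerDoSEQ()
}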
library(MCPanel)
did_estimate = function(Y, N0, T0) {
synthdid::did_estimate(Y, N0, T0)
}
sc_estimate_reg = function(Y, N0, T0) {
synthdid::sc_estimate(Y, N0, T0, eta.omega = ((nrow(Y) - N0) * (ncol(Y) - T0))^(1/4))
}
sdid_estimate = function(Y, N0, T0) {
synthdid::synthdid_estimate(Y, N0, T0)
}
mc_estimate = function(Y, N0, T0) {
N1=nrow(Y)-N0
T1=ncol(Y)-T0
W <- outer(c(rep(0,N0),rep(1,N1)),c(rep(0,T0),rep(1,T1)))
mc_pred <- mcnnm_cv(Y, 1-W, num_lam_L = 20)
mc_fit <- mc_pred$L + outer(mc_pred$u, mc_pred$v, '+')
mc_est <- sum(W*(Y-mc_fit))/sum(W)
mc_est
}
mc_placebo_se = function(Y, N0, T0, replications=200) {
N1 = nrow(Y) - N0
theta = function(ind) { mc_estimate(Y[ind,], length(ind)-N1, T0) }
sqrt((replications-1)/replications) * sd(replicate(replications, theta(sample(1:N0))))
}
difp_estimate = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)), eta.omega=1e-6)
}
difp_estimate_reg = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)))
}
estimators = list(did=did_estimate,
sc=sc_estimate,
sdid=synthdid_estimate,
difp=difp_estimate,
mc = mc_estimate,
sc_reg = sc_estimate_reg,
difp_reg = difp_estimate_reg)
library(synthdid)
library(ggplot2)
set.seed(12345)
data('california_prop99')
setup <- synthdid::panel.matrices(california_prop99)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
standard.errors = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup$Y, setup$N0, setup$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates, names(estimators))
str(estimates)
## List of 7
## $ did : 'synthdid_estimate' num [1, 1] -27.3
## ..- attr(*, "estimator")= chr "did_estimate"
## ..- attr(*, "weights")=List of 2
## .. ..$ lambda: num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0.0263 0.0263 0.0263 0.0263 0.0263 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi FALSE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ sc : 'synthdid_estimate' num [1, 1] -19.6
## ..- attr(*, "estimator")= chr "sc_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ omega : num [1:38] 0 0 0.01332 0.10447 0.00405 ...
## .. ..$ omega.vals: num [1:10000] 7.13 6.93 6.77 6.64 6.52 ...
## .. ..$ vals : num [1:10000] 7.13 6.93 6.77 6.64 6.52 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 5.49e-06
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi FALSE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ sdid : 'synthdid_estimate' num [1, 1] -15.6
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 5
## .. ..$ omega : num [1:38] 0 0.00344 0.05751 0.07829 0.07037 ...
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ lambda.vals: num [1:10000] 77.3 77.3 77.3 77.3 77.3 ...
## .. ..$ vals : num [1:10000] 90.5 89.8 89.5 89 88.9 ...
## .. ..$ omega.vals : num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi TRUE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ difp : 'synthdid_estimate' num [1, 1] -11.1
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0 0.0018 0.0969 0.2664 0 ...
## .. ..$ omega.vals: num [1:10000] 1.46 1.38 1.36 1.35 1.34 ...
## .. ..$ vals : num [1:10000] 1.46 1.38 1.36 1.35 1.34 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 5.49e-06
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ mc : num -20.2
## $ sc_reg : 'synthdid_estimate' num [1, 1] -21.7
## ..- attr(*, "estimator")= chr "sc_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ omega : num [1:38] 0 0 0.0857 0.0647 0 ...
## .. ..$ omega.vals: num [1:10000] 26.4 22.2 21.9 20.6 20.3 ...
## .. ..$ vals : num [1:10000] 26.4 22.2 21.9 20.6 20.3 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi FALSE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ difp_reg: 'synthdid_estimate' num [1, 1] -16.1
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0 0.00344 0.05751 0.07829 0.07037 ...
## .. ..$ omega.vals: num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## .. ..$ vals : num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california.table) = c('estimate', 'standard error')
colnames(california.table) = toupper(names(estimators))
round(california.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -27.3 -19.6 -15.6 -11.1 -20.2 -21.7 -16.1
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
summary(california_prop99)
## State Year PacksPerCapita treated
## Alabama : 31 Min. :1970 Min. : 40.7 Min. :0.000000
## Arkansas : 31 1st Qu.:1977 1st Qu.:100.9 1st Qu.:0.000000
## California : 31 Median :1985 Median :116.3 Median :0.000000
## Colorado : 31 Mean :1985 Mean :118.9 Mean :0.009926
## Connecticut: 31 3rd Qu.:1993 3rd Qu.:130.5 3rd Qu.:0.000000
## Delaware : 31 Max. :2000 Max. :296.2 Max. :1.000000
## (Other) :1023
library(dplyr)
california_minus2 <- california_prop99 %>%
filter(! State %in% c("Nevada", "New Hampshire"))
# california_minus2 is created above (it is not shipped with synthdid), so no data() call is needed
setup <- synthdid::panel.matrices(california_minus2)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
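# Note: standard.errors was computed on the full sample (above) and is not recomputed for
# the subsamples, so the standard-error rows in this and the following tables repeat the
# original placebo values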
california2.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california2.table) = c('estimate', 'standard error')
colnames(california2.table) = toupper(names(estimators))
round(california2.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -30.1 -21.7 -19.5 -16.7 -24.0 -23.4 -20.6
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The point estimates are generally larger in magnitude than before, i.e., more biased. Nevada and New Hampshire previously carried the largest weights in constructing the counterfactual, so dropping them worsens the goodness of fit of the synthetic California.
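This can be verified directly from the original fit, since tau.hat stores the unit weights and summary() reports the largest ones:
# Donor-state (omega) weights from the original SDID fit: Nevada and New Hampshire sit at the top
summary(tau.hat)$controls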
california3 <- california_prop99 %>%
filter(Year > 1974)
setup <- synthdid::panel.matrices(california3)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california3.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california3.table) = c('estimate', 'standard error')
colnames(california3.table) = toupper(names(estimators))
round(california3.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -23.7 -20.3 -17.9 -19.6 -20.6 -23.1 -18.4
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The DID estimate is insignificant, and the other estimates are larger in magnitude than before. This is reasonable given that cigarette consumption before 1975 trends in the opposite direction (rising rather than falling) compared with the later pre-treatment years. Those relatively low early values pulled the pre-treatment outcome down, so dropping that portion allows a more linear fit of the pre-trends and captures a larger treatment effect.
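One way to see the early reversal is to plot California against the donor-pool average over the full sample; a quick sketch using the raw panel (column names as in california_prop99):
california_prop99 %>%
  mutate(group = if_else(State == "California", "California", "Donor average")) %>%
  group_by(group, Year) %>%
  summarise(packs = mean(PacksPerCapita), .groups = "drop") %>%
  ggplot(aes(Year, packs, colour = group)) +
  geom_line() +
  geom_vline(xintercept = 1989, linetype = "dashed") +  # treatment begins in 1989
  labs(y = "Packs per capita", colour = NULL)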
california4 <- california_prop99 %>%
filter(Year < 1996)
setup <- synthdid::panel.matrices(california4)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california4.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california4.table) = c('estimate', 'standard error')
colnames(california4.table) = toupper(names(estimators))
round(california4.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -22.2 -15.3 -9.8 -8.0 -14.2 -17.1 -10.8
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The estimates become smaller in magnitude and mostly insignificant, which makes sense given that the gap between the treated and synthetic trajectories widens after the treatment kicks in. Cutting off the later treatment periods therefore eliminates the most informative observations, those with the largest changes.
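A quick check of how much post-treatment information remains after this cut (seven treated years, 1989-1995, versus twelve in the full sample):
# Treated (post-1988) years remaining in the truncated sample
sum(unique(california4$Year) > 1988)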
Research question: How did the surge in “tariffs” web searches affect the consumer price index (CPI) in the US, Canada, and Mexico?
library(readxl)
data_monthly <- read_excel("Applied_Econometrics.xlsx", sheet = "Monthly")
data_quarterly <- read_excel("Applied_Econometrics.xlsx", sheet = "Quarterly")
## Merge the monthly data with the quarterly data: assign each quarterly value to its corresponding months
library(dplyr)
library(tidyr)
library(lubridate)
monthly_data <- data_quarterly %>%
mutate(
quarter_start = floor_date(observation_date, unit = "quarter")
) %>%
rowwise() %>%
mutate(
months = list(seq(quarter_start, by = "1 month", length.out = 3))
) %>%
ungroup() %>%
select(-observation_date, -quarter_start) %>%
unnest(cols = months) %>%
rename(observation_date = months) %>%
select(observation_date, Rgdp_Canada, Rgdp_Mexico, Unemployment_mexico)
head(monthly_data)
## # A tibble: 6 × 4
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico
## <dttm> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92
## 4 2020-07-01 00:00:00 536725. 5538841 4.84
## 5 2020-08-01 00:00:00 536725. 5538841 4.84
## 6 2020-09-01 00:00:00 536725. 5538841 4.84
monthly_full <- monthly_data %>%
left_join(data_monthly, by = "observation_date") %>%
select(-Unemployment_MX)
head(monthly_full)
## # A tibble: 6 × 11
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92 107.
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92 108.
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92 108.
## 4 2020-07-01 00:00:00 536725. 5538841 4.84 108.
## 5 2020-08-01 00:00:00 536725. 5538841 4.84 108.
## 6 2020-09-01 00:00:00 536725. 5538841 4.84 108.
## # ℹ 6 more variables: CPI_US <dbl>, Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>
search_US <- read.csv("USTimeline.csv") %>%
rename(searches = "Tariffs...United.States.") %>%
mutate(
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(US_search = sum(searches, na.rm = TRUE))
head(search_US)
## # A tibble: 6 × 2
## observation_date US_search
## <date> <dbl>
## 1 2020-04-01 1
## 2 2020-05-01 2
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 0
## 6 2020-09-01 4
search_Canada <- read.csv("CanadaTimeline.csv") %>%
rename(searches = "Tariffs...Canada.") %>%
mutate(
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(Canada_search = sum(searches, na.rm = TRUE))
head(search_Canada)
## # A tibble: 6 × 2
## observation_date Canada_search
## <date> <dbl>
## 1 2020-04-01 0
## 2 2020-05-01 0
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 3
## 6 2020-09-01 1
search_Mexico <- read.csv("MexicoTimeline.csv") %>%
rename(searches = "Tariffs...Mexico.") %>%
mutate(
searches = as.character(searches),
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(Mexico_search = sum(searches, na.rm = TRUE))
head(search_Mexico)
## # A tibble: 6 × 2
## observation_date Mexico_search
## <date> <dbl>
## 1 2020-04-01 0
## 2 2020-05-01 0
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 0
## 6 2020-09-01 0
full_data <- monthly_full %>%
left_join(search_Mexico, by = "observation_date") %>%
left_join(search_Canada, by = "observation_date") %>%
left_join(search_US, by = "observation_date")
head(full_data)
## # A tibble: 6 × 14
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92 107.
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92 108.
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92 108.
## 4 2020-07-01 00:00:00 536725. 5538841 4.84 108.
## 5 2020-08-01 00:00:00 536725. 5538841 4.84 108.
## 6 2020-09-01 00:00:00 536725. 5538841 4.84 108.
## # ℹ 9 more variables: CPI_US <dbl>, Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>,
## # Mexico_search <dbl>, Canada_search <dbl>, US_search <dbl>
#install.packages("zoo")
library(zoo)
# full_data is the merged dataset; define the event month (October 2024) and a relative-time index
reg_data <- full_data %>%
mutate(observation_date = ymd(as.character(observation_date))) %>%
mutate(
event_month = as.Date("2024-10-01"),
relative_time = as.integer((observation_date - event_month) / 30))
head(reg_data)
## # A tibble: 6 × 16
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada CPI_US
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 492032. 4797408. 4.92 107. 108.
## 2 2020-05-01 492032. 4797408. 4.92 108. 108.
## 3 2020-06-01 492032. 4797408. 4.92 108. 108.
## 4 2020-07-01 536725. 5538841 4.84 108. 109.
## 5 2020-08-01 536725. 5538841 4.84 108. 109.
## 6 2020-09-01 536725. 5538841 4.84 108. 110.
## # ℹ 10 more variables: Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>,
## # Mexico_search <dbl>, Canada_search <dbl>, US_search <dbl>,
## # event_month <date>, relative_time <int>
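Dividing the day difference by 30 only approximates whole months. An exact month offset can be computed from the year and month components instead; this sketch adds a separate column (relative_time_exact, a name introduced here for illustration) and leaves the variable used below unchanged:
reg_data <- reg_data %>%
  mutate(relative_time_exact = 12 * (year(observation_date) - year(event_month)) +
           (month(observation_date) - month(event_month)))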
library(dplyr)
library(lubridate)
# Set the event date: October 2024
event_date <- ymd("2024-10-01")
# Prepare your data, creating the 'post_event' variable
reg_data <- reg_data %>%
mutate(
date = ymd(observation_date), # Ensure the 'date' column is in Date format
post_event = if_else(observation_date >= event_date, 1, 0) # 1 if after event, 0 if before
)
library(fixest)
# Run the DID regression model
DID_model <- feols(
CPI_US ~ US_search * post_event + US_Consumption + Unemployment_US,
data = reg_data
)
# Summarize the results
summary(DID_model)
## OLS estimation, Dep. Var.: CPI_US
## Observations: 57
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.073424 3.637384 7.992950 1.4902e-10 ***
## US_search 0.013319 0.102086 0.130469 8.9671e-01
## post_event -2.127462 1.615286 -1.317081 1.9370e-01
## US_Consumption 0.005097 0.000178 28.681504 < 2.2e-16 ***
## Unemployment_US 0.821262 0.137465 5.974326 2.2501e-07 ***
## US_search:post_event -0.015629 0.129373 -0.120807 9.0432e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.26919 Adj. R2: 0.975796
library(fixest)
US_model <- feols(
CPI_US ~ i(relative_time, ref = -1)*US_search + US_Consumption + Unemployment_US,
data = reg_data
)
## The variables 'US_search', 'Unemployment_US', 'US_search:relative_time::-54', 'US_search:relative_time::-53', 'US_search:relative_time::-52', 'US_search:relative_time::-51' and 52 others have been removed because of collinearity (see $collin.var).
summary(US_model)
## Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
## OLS estimation, Dep. Var.: CPI_US
## Observations: 57
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 256.00 0.000010 2.560000e+07 2.4868e-08 ***
## relative_time::-54 -32.00 0.045685 -7.004438e+02 9.0888e-04 ***
## relative_time::-53 -40.00 0.048417 -8.261570e+02 7.7058e-04 ***
## relative_time::-52 -48.00 0.020484 -2.343287e+03 2.7168e-04 ***
## relative_time::-51 -32.00 0.027332 -1.170783e+03 5.4376e-04 ***
## relative_time::-50 -32.00 0.066064 -4.843758e+02 1.3143e-03 **
## relative_time::-49 -32.00 0.056811 -5.632675e+02 1.1302e-03 **
## relative_time::-48 -48.00 0.036301 -1.322262e+03 4.8146e-04 ***
## relative_time::-47 -64.00 0.055262 -1.158122e+03 5.4970e-04 ***
## relative_time::-46 -32.00 0.004757 -6.727498e+03 9.4630e-05 ***
## relative_time::-45 -16.00 0.013558 -1.180142e+03 5.3944e-04 ***
## relative_time::-44 -40.00 0.069664 -5.741869e+02 1.1087e-03 **
## relative_time::-43 -56.00 0.054304 -1.031222e+03 6.1734e-04 ***
## relative_time::-42 -32.00 0.054469 -5.874880e+02 1.0836e-03 **
## relative_time::-41 -48.00 0.039246 -1.223046e+03 5.2052e-04 ***
## relative_time::-40 -32.00 0.015460 -2.069913e+03 3.0756e-04 ***
## relative_time::-39 -48.00 0.008608 -5.576093e+03 1.1417e-04 ***
## relative_time::-38 -24.00 0.021466 -1.118044e+03 5.6940e-04 ***
## relative_time::-37 -32.00 0.006622 -4.832633e+03 1.3173e-04 ***
## relative_time::-36 -8.00 0.018480 -4.328999e+02 1.4706e-03 **
## relative_time::-35 -32.00 0.004786 -6.686039e+03 9.5216e-05 ***
## relative_time::-34 -16.00 0.114895 -1.392572e+02 4.5715e-03 **
## relative_time::-33 -24.00 0.073169 -3.280065e+02 1.9409e-03 **
## relative_time::-32 -28.00 0.084084 -3.330009e+02 1.9118e-03 **
## relative_time::-31 -16.00 0.012959 -1.234621e+03 5.1564e-04 ***
## relative_time::-30 -16.00 0.058659 -2.727613e+02 2.3340e-03 **
## relative_time::-29 -32.00 0.026110 -1.225600e+03 5.1944e-04 ***
## relative_time::-28 -28.00 0.016236 -1.724536e+03 3.6915e-04 ***
## relative_time::-27 -20.00 0.033208 -6.022620e+02 1.0570e-03 **
## relative_time::-26 -16.00 0.024596 -6.505215e+02 9.7863e-04 ***
## relative_time::-25 -16.00 0.011101 -1.441321e+03 4.4169e-04 ***
## relative_time::-24 -12.00 0.037774 -3.176789e+02 2.0040e-03 **
## relative_time::-23 -16.00 0.014990 -1.067345e+03 5.9645e-04 ***
## relative_time::-22 -8.00 0.050513 -1.583764e+02 4.0196e-03 **
## relative_time::-21 -14.00 0.022056 -6.347487e+02 1.0029e-03 **
## relative_time::-20 -18.00 0.027117 -6.637994e+02 9.5905e-04 ***
## relative_time::-19 -14.00 0.042488 -3.295040e+02 1.9320e-03 **
## relative_time::-18 -6.00 0.024212 -2.478091e+02 2.5690e-03 **
## relative_time::-17 -8.00 0.056730 -1.410183e+02 4.5144e-03 **
## relative_time::-16 -4.00 0.013915 -2.874650e+02 2.2146e-03 **
## relative_time::-15 -6.00 0.034634 -1.732397e+02 3.6747e-03 **
## relative_time::-14 -12.00 0.030003 -3.999653e+02 1.5917e-03 **
## relative_time::-13 -4.00 0.006861 -5.829862e+02 1.0920e-03 **
## relative_time::-12 -4.00 0.005416 -7.385876e+02 8.6194e-04 ***
## relative_time::-11 -7.00 0.006559 -1.067157e+03 5.9656e-04 ***
## relative_time::-10 -6.00 0.007932 -7.564634e+02 8.4157e-04 ***
## relative_time::-9 -3.00 0.019622 -1.528897e+02 4.1639e-03 **
## relative_time::-8 -6.00 0.000054 -1.105491e+05 5.7587e-06 ***
## relative_time::-7 -1.00 0.018653 -5.361132e+01 1.1873e-02 *
## relative_time::-6 -5.00 0.017828 -2.804526e+02 2.2700e-03 **
## relative_time::-5 -1.00 0.006592 -1.517038e+02 4.1964e-03 **
## relative_time::-4 -3.50 0.002980 -1.174520e+03 5.4203e-04 ***
## relative_time::-3 -0.25 0.009188 -2.720922e+01 2.3387e-02 *
## relative_time::-2 0.25 0.005507 4.539377e+01 1.4022e-02 *
## relative_time::0 0.50 0.006404 7.807192e+01 8.1538e-03 **
## relative_time::1 1.00 0.004197 2.382410e+02 2.6722e-03 **
## relative_time::2 2.00 0.004428 4.517188e+02 1.4093e-03 **
## US_Consumption 0.00 22.193701 0.000000e+00 1.0000e+00
## ... 58 variables were removed because of collinearity (US_search, Unemployment_US and 56 others [full set in $collin.var])
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 115.0 Adj. R2: 10,131.7
coef_data <- coef(US_model)
model_summary <- summary(US_model)
## Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
#se_data <- model_summary$coefficients[, 2] # The standard errors are in the second column
coef_df <- data.frame(
  term = names(coef_data),        # coefficients are named by the terms in the model
  coefficient = coef_data
  # std_error = se_data,
  # conf_lower = coef_data - 1.96 * se_data,  # 95% confidence interval lower bound
  # conf_upper = coef_data + 1.96 * se_data   # 95% confidence interval upper bound
) %>%
  # Keep only the event-study coefficients and parse the relative time as a number
  filter(grepl("^relative_time::", term)) %>%
  mutate(relative_time = as.numeric(sub("^relative_time::", "", term)))
# Remove NA or non-numeric rows if any
coef_df <- na.omit(coef_df)
ggplot(coef_df, aes(x = relative_time, y = coefficient)) +
geom_line(color = "blue") +
geom_point(color = "red") +
theme_minimal() +
labs(
title = "Event Study: Impact of 'Tariffs Web Searches' on CPI",
x = "Relative Time",
y = "Effect on CPI",
subtitle = "Event-time OLS specification"
) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black")
There are not enough data points after the event; macroeconomic data for 2025 were not yet available.
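This is easy to confirm from the estimation sample:
# Months before (0) versus on/after (1) the October 2024 event
table(reg_data$post_event)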
Yes, it does. If the conditional-mean (exogeneity) and full-rank requirements are satisfied, OLS is the best linear unbiased estimator. The penalty term λI in the ridge estimator introduces bias by construction: with λ > 0 it shrinks the coefficients toward zero, causing bias.
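For reference, a sketch of the standard argument (λ is the ridge penalty and I the identity matrix):
$$\hat{\beta}_{\text{ridge}} = (X'X + \lambda I)^{-1} X'y, \qquad E\big[\hat{\beta}_{\text{ridge}} \mid X\big] = (X'X + \lambda I)^{-1} X'X\,\beta,$$
which equals β only when λ = 0; for λ > 0 the coefficients are shrunk toward zero, so the estimator is biased, while OLS remains unbiased and, under the Gauss-Markov conditions, BLUE.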
housingdata <- readRDS("testdata20250121.RDS")
library(tidyr)
library(wooldridge)
## Warning: package 'wooldridge' was built under R version 4.4.3
##
## Attaching package: 'wooldridge'
## The following object is masked from 'package:MASS':
##
## cement
library(fixest)
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
library(causalweight)
library(fixest)
library(dplyr)
library(lubridate)
housingdata <- housingdata %>%
mutate(sale_date = ymd(as.character(sale_date)))
#Separate columns
housingdata <- housingdata %>%
mutate(
year = year(sale_date),
month = month(sale_date),
day = day(sale_date)
)
#Add log of land square footage
housingdata$log_land_square_footage <- log(housingdata$land_square_footage)
#Add LOG SALE AMOUNT
housingdata$log_sale_amount <- log(housingdata$sale_amount)
training <- subset(housingdata, !(abbr %in% c("NY", "CO", "CA")))
test <- subset(housingdata, abbr %in% c("NY", "CO", "CA"))
# Load necessary libraries
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Define response and predictors
response <- "log_sale_amount"
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage", "AC_presence")
# Prepare data: training dataset
x_train <- as.matrix(as.data.frame(training)[, predictors])
y_train <- as.data.frame(training)[, response]
# Set seed for reproducibility
set.seed(123)
# Perform LASSO with 5-fold cross-validation
cv_lasso <- cv.glmnet(
x = x_train,
y = y_train,
alpha = 1, # alpha = 1 for LASSO
nfolds = 5,
type.measure = "mse"
)
# Plot cross-validation results
plot(cv_lasso)
# Best lambda (minimizing mse)
best_lambda <- cv_lasso$lambda.min
cat("Best lambda:", best_lambda, "\n")
## Best lambda: 0.001110023
# Coefficients of best model
coef(cv_lasso, s = "lambda.min")
## 8 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 5.261725555
## year_built 0.002818082
## bedrooms_all_buildings 0.025832984
## number_of_bathrooms 0.241310506
## number_of_fireplaces .
## stories_number 0.175381566
## log_land_square_footage 0.051093076
## AC_presence 0.025023535
y_mean <- mean(training$sale_amount)
print(y_mean)
## [1] 304779.4
The best lambda is 0.001110023. Treating the LASSO coefficient on AC_presence (≈ 0.025) as a semi-elasticity of price, MWTP ≈ 0.025 × 304,779.4 ≈ $7,619.49.
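The same figure can be computed directly from the fitted objects rather than by hand, reusing cv_lasso and y_mean from above:
# MWTP for AC: log-price semi-elasticity times the mean sale price in the training data
ac_coef <- coef(cv_lasso, s = "lambda.min")["AC_presence", 1]
round(ac_coef * y_mean, 2)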
# Filter test dataset: with AC and without AC
test_with_ac <- test[test$AC_presence == 1, ]
test_without_ac <- test[test$AC_presence == 0, ]
# Compute average values of predictors for both groups
avg_with_ac <- colMeans(test_with_ac[, ..predictors], na.rm = TRUE)
avg_without_ac <- colMeans(test_without_ac[, ..predictors], na.rm = TRUE)
# Retrieve LASSO coefficients (already fitted)
lasso_coef <- coef(cv_lasso, s = "lambda.min")
coef_names <- rownames(lasso_coef)
coef_values <- as.numeric(lasso_coef)
# Convert coefficients to named vector for easy lookup
beta <- setNames(coef_values, coef_names)
# Add intercept
intercept <- beta["(Intercept)"]
# Compute predicted log prices
pred_log_with_ac <- intercept + sum(beta[names(avg_with_ac)] * avg_with_ac)
pred_log_without_ac <- intercept + sum(beta[names(avg_without_ac)] * avg_without_ac)
# Convert from log price to price
pred_price_with_ac <- exp(pred_log_with_ac)
pred_price_without_ac <- exp(pred_log_without_ac)
# Compute difference in predicted price
price_diff <- pred_price_with_ac - pred_price_without_ac
# Output results
cat("Predicted price with AC:", round(pred_price_with_ac, 2), "\n")
## Predicted price with AC: 214771.4
cat("Predicted price without AC:", round(pred_price_without_ac, 2), "\n")
## Predicted price without AC: 193428.4
cat("Difference in predicted price:", round(price_diff, 2), "\n")
## Difference in predicted price: 21343
The difference in predicted house price is $21,343.
obs_diff <- test %>%
group_by(AC_presence) %>%
summarise(mean_observed_price = mean(sale_amount, na.rm = TRUE)) %>%
summarise(observed_diff = diff(mean_observed_price)) # with AC - without AC
print(obs_diff)
## # A tibble: 1 × 1
## observed_diff
## <dbl>
## 1 -88846.
The observed difference in mean price is -$88,845.84, which is opposite in sign and far larger in magnitude than the LASSO-predicted difference; the predicted and observed values are not close.
# --------------------------
# New York
# --------------------------
ny_data <- test[abbr == "NY"]
ny_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = ny_data)
ny_ac_coef <- coef(ny_model)["AC_presence"]
ny_avg_price <- exp(mean(ny_data$log_sale_amount, na.rm = TRUE))
ny_mwtp <- ny_ac_coef * ny_avg_price
cat("\n--- New York ---\n")
##
## --- New York ---
cat("AC Coefficient (log):", round(ny_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0895
cat("MWTP for AC ($):", round(ny_mwtp, 2), "\n")
## MWTP for AC ($): 45273.02
# --------------------------
# Colorado
# --------------------------
co_data <- test[abbr == "CO"]
co_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = co_data)
co_ac_coef <- coef(co_model)["AC_presence"]
co_avg_price <- exp(mean(co_data$log_sale_amount, na.rm = TRUE))
co_mwtp <- co_ac_coef * co_avg_price
cat("\n--- Colorado ---\n")
##
## --- Colorado ---
cat("AC Coefficient (log):", round(co_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0603
cat("MWTP for AC ($):", round(co_mwtp, 2), "\n")
## MWTP for AC ($): 21814.07
# --------------------------
# California
# --------------------------
ca_data <- test[abbr == "CA"]
ca_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = ca_data)
ca_ac_coef <- coef(ca_model)["AC_presence"]
ca_avg_price <- exp(mean(ca_data$log_sale_amount, na.rm = TRUE))
ca_mwtp <- ca_ac_coef * ca_avg_price
cat("\n--- California ---\n")
##
## --- California ---
cat("AC Coefficient (log):", round(ca_ac_coef, 4), "\n")
## AC Coefficient (log): -0.3004
cat("MWTP for AC ($):", round(ca_mwtp, 2), "\n")
## MWTP for AC ($): -138097.2
The marginal willingness to pay differs substantially across the three states, and each state-specific value differs from the overall estimate in part c.
According to the Frisch-Waugh-Lovell (FWL) theorem, both claims are correct.
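A small simulated illustration of the FWL result (base R only): the coefficient on x1 from the full regression equals the coefficient from regressing residualized y on residualized x1.
# FWL: partialling x2 out of both y and x1 reproduces the full-regression coefficient on x1
set.seed(1)
n  <- 500
x2 <- rnorm(n)
x1 <- 0.5 * x2 + rnorm(n)
y  <- 1 + 2 * x1 - x2 + rnorm(n)
full_fit <- lm(y ~ x1 + x2)
y_tilde  <- resid(lm(y ~ x2))   # y with x2 partialled out
x1_tilde <- resid(lm(x1 ~ x2))  # x1 with x2 partialled out
partial_fit <- lm(y_tilde ~ x1_tilde)
c(full = unname(coef(full_fit)["x1"]), fwl = unname(coef(partial_fit)["x1_tilde"]))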
install.packages("causaldrf")
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'causaldrf' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
library(causalweight)
library(glmnet)
library(causaldrf)
## Warning: package 'causaldrf' was built under R version 4.4.3
CA_data <- test[abbr == "CA"]
# Define treatment variable (AC presence) and outcome variable (log_sale_amount)
treatment <- CA_data$AC_presence
outcome <- CA_data$log_sale_amount
# Define predictor columns in correct order
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage", "AC_presence")
# Filter CA dataset: with AC and without AC
CA_data_with_ac <- CA_data[CA_data$AC_presence == 1, ]
CA_data_without_ac <- CA_data[CA_data$AC_presence == 0, ]
# Compute average values of predictors for both groups
avg_with_ac <- colMeans(CA_data_with_ac[, ..predictors], na.rm = TRUE)
avg_without_ac <- colMeans(CA_data_without_ac[, ..predictors], na.rm = TRUE)
# Retrieve LASSO coefficients (already fitted)
lasso_coef <- coef(cv_lasso, s = "lambda.min")
coef_names <- rownames(lasso_coef)
coef_values <- as.numeric(lasso_coef)
# Convert coefficients to named vector for easy lookup
beta <- setNames(coef_values, coef_names)
# Add intercept (should match the intercept in the LASSO model)
intercept <- beta["(Intercept)"]
# Compute predicted log prices for homes with and without AC
pred_log_with_ac <- intercept + sum(beta[names(avg_with_ac)] * avg_with_ac)
pred_log_without_ac <- intercept + sum(beta[names(avg_without_ac)] * avg_without_ac)
# Convert from log price to actual price
pred_price_with_ac <- exp(pred_log_with_ac)
pred_price_without_ac <- exp(pred_log_without_ac)
# Compute difference in predicted prices
price_diff <- pred_price_with_ac - pred_price_without_ac
# Output the results
cat("Predicted price with AC:", round(pred_price_with_ac, 2), "\n")
## Predicted price with AC: 205505.3
cat("Predicted price without AC:", round(pred_price_without_ac, 2), "\n")
## Predicted price without AC: 177486.3
cat("Difference in predicted price:", round(price_diff, 2), "\n")
## Difference in predicted price: 28019.02
outcome <- "log_sale_amount" # Example outcome variable
treatment <- "AC_presence" # Treatment variable (whether the house has AC)
covariates <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage") # List of covariates to control for
# Prepare the data (CA_data is your dataset)
X <- as.matrix(CA_data[, covariates, with = FALSE]) # Covariates matrix
Y <- CA_data[[outcome]] # Outcome variable
D <- CA_data[[treatment]] # Treatment variable
# Set the seed for reproducibility
set.seed(123)
# Apply Double Machine Learning (DML) using LASSO
dml_model <- treatDML(y = Y,
d = D,
x = X,
MLmethod = "lasso",
k = 3) # 3-fold cross-validation
## Loading required package: nnls
# Extract the estimated effect of AC on log sale price (a semi-elasticity, not a dollar amount)
mwtp <- dml_model$effect  # treatDML stores the ATE in $effect
#cat("Marginal Willingness to Pay (MWTP) for AC:", round(mwtp, 4), "\n")