# Libraries
# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot,
cobalt, MatchIt, Zelig, estimatr, cem, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
library(fixest)
##
## Attaching package: 'fixest'
## The following object is masked from 'package:scales':
##
## pvalue
## The following object is masked from 'package:Jmisc':
##
## demean
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
library(causalweight)
## Loading required package: ranger
library(fixest)
install.packages('devtools')
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
devtools::install_github('synth-inference/synthdid', ref='sdid-paper')
## Using GitHub PAT from the git credential store.
## Skipping install of 'synthdid' from a github remote, the SHA1 (36923192) has not changed since last install.
## Use `force = TRUE` to force installation
install.packages(c('dplyr', 'tidyr', 'tibble', 'ggplot2', 'devtools', 'xtable'))
## Warning: packages 'dplyr', 'tidyr', 'tibble', 'ggplot2' are in use and will not
## be installed
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
## package 'xtable' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
devtools::install_github('cran/glmnet', ref='f4fc95ab49efaad9b6e1728a7c840bc6159501dc')
## Using GitHub PAT from the git credential store.
## Skipping install of 'glmnet' from a github remote, the SHA1 (f4fc95ab) has not changed since last install.
## Use `force = TRUE` to force installation
devtools::install_github('susanathey/MCPanel', ref='6b2706fd7c35f3266048ceb22a7e9a61ae1774da')
## Using GitHub PAT from the git credential store.
## Skipping install of 'MCPanel' from a github remote, the SHA1 (6b2706fd) has not changed since last install.
## Use `force = TRUE` to force installation
library(synthdid)
data('california_prop99')
setup = panel.matrices(california_prop99)
tau.hat = synthdid_estimate(setup$Y, setup$N0, setup$T0)
tau.hat
## synthdid: -15.604 +- NA. Effective N0/N0 = 16.4/38~0.4. Effective T0/T0 = 2.8/19~0.1. N1,T1 = 1,12.
print(summary(tau.hat))
## $estimate
## [1] -15.60383
##
## $se
## [,1]
## [1,] NA
##
## $controls
## estimate 1
## Nevada 0.124
## New Hampshire 0.105
## Connecticut 0.078
## Delaware 0.070
## Colorado 0.058
## Illinois 0.053
## Nebraska 0.048
## Montana 0.045
## Utah 0.042
## New Mexico 0.041
## Minnesota 0.039
## Wisconsin 0.037
## West Virginia 0.034
## North Carolina 0.033
## Idaho 0.031
## Ohio 0.031
## Maine 0.028
## Iowa 0.026
##
## $periods
## estimate 1
## 1988 0.427
## 1986 0.366
## 1987 0.206
##
## $dimensions
## N1 N0 N0.effective T1 T0 T0.effective
## 1.000 38.000 16.388 12.000 19.000 2.783
se = sqrt(vcov(tau.hat, method='placebo'))
sprintf('point estimate: %1.2f', tau.hat)
## [1] "point estimate: -15.60"
sprintf('95%% CI (%1.2f, %1.2f)', tau.hat - 1.96 * se, tau.hat + 1.96 * se)
## [1] "95% CI (-35.41, 4.20)"
plot(tau.hat, se.method='placebo')
library(parallel)
library(doSNOW)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: snow
##
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
##
## closeNode, clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
## parCapply, parLapply, parRapply, parSapply, recvData, recvOneData,
## sendData, splitIndices, stopCluster
library(foreach)
# -----------------------------------------
# Set up parallel backend (you can turn this off with paraprocess = 0)
# -----------------------------------------
paraprocess <- 1
if (paraprocess == 1) {
availcores <- detectCores()
numcores <- max(1, floor(availcores / 1.2))
cl <- makeCluster(numcores, type = "SOCK")
registerDoSNOW(cl)
clusterSetRNGStream(cl, 12345) # Ensures reproducibility
}
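For completeness, the backend should be released once the parallel runs are finished; a minimal cleanup sketch (intended for the end of the session, not run here), assuming cl is still in scope:
# Shut down the SOCK cluster and fall back to sequential execution
if (paraprocess == 1) {
  stopCluster(cl)
  registerDoSEQ()
}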
library(MCPanel)
did_estimate = function(Y, N0, T0) {
synthdid::did_estimate(Y, N0, T0)
}
sc_estimate_reg = function(Y, N0, T0) {
synthdid::sc_estimate(Y, N0, T0, eta.omega = ((nrow(Y) - N0) * (ncol(Y) - T0))^(1/4))
}
sdid_estimate = function(Y, N0, T0) {
synthdid::synthdid_estimate(Y, N0, T0)
}
mc_estimate = function(Y, N0, T0) {
N1=nrow(Y)-N0
T1=ncol(Y)-T0
W <- outer(c(rep(0,N0),rep(1,N1)),c(rep(0,T0),rep(1,T1)))
mc_pred <- mcnnm_cv(Y, 1-W, num_lam_L = 20)
mc_fit <- mc_pred$L + outer(mc_pred$u, mc_pred$v, '+')
mc_est <- sum(W*(Y-mc_fit))/sum(W)
mc_est
}
mc_placebo_se = function(Y, N0, T0, replications=200) {
N1 = nrow(Y) - N0
theta = function(ind) { mc_estimate(Y[ind,], length(ind)-N1, T0) }
sqrt((replications-1)/replications) * sd(replicate(replications, theta(sample(1:N0))))
}
difp_estimate = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)), eta.omega=1e-6)
}
difp_estimate_reg = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)))
}
estimators = list(did=did_estimate,
sc=sc_estimate,
sdid=synthdid_estimate,
difp=difp_estimate,
mc = mc_estimate,
sc_reg = sc_estimate_reg,
difp_reg = difp_estimate_reg)
library(synthdid)
library(ggplot2)
set.seed(12345)
data('california_prop99')
setup <- synthdid::panel.matrices(california_prop99)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
standard.errors = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup$Y, setup$N0, setup$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates, names(estimators))
str(estimates)
## List of 7
## $ did : 'synthdid_estimate' num [1, 1] -27.3
## ..- attr(*, "estimator")= chr "did_estimate"
## ..- attr(*, "weights")=List of 2
## .. ..$ lambda: num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0.0263 0.0263 0.0263 0.0263 0.0263 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi FALSE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ sc : 'synthdid_estimate' num [1, 1] -19.6
## ..- attr(*, "estimator")= chr "sc_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ omega : num [1:38] 0 0 0.01332 0.10447 0.00405 ...
## .. ..$ omega.vals: num [1:10000] 7.13 6.93 6.77 6.64 6.52 ...
## .. ..$ vals : num [1:10000] 7.13 6.93 6.77 6.64 6.52 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 5.49e-06
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi FALSE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ sdid : 'synthdid_estimate' num [1, 1] -15.6
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 5
## .. ..$ omega : num [1:38] 0 0.00344 0.05751 0.07829 0.07037 ...
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ lambda.vals: num [1:10000] 77.3 77.3 77.3 77.3 77.3 ...
## .. ..$ vals : num [1:10000] 90.5 89.8 89.5 89 88.9 ...
## .. ..$ omega.vals : num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi TRUE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ difp : 'synthdid_estimate' num [1, 1] -11.1
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0 0.0018 0.0969 0.2664 0 ...
## .. ..$ omega.vals: num [1:10000] 1.46 1.38 1.36 1.35 1.34 ...
## .. ..$ vals : num [1:10000] 1.46 1.38 1.36 1.35 1.34 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 5.49e-06
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ mc : num -20.2
## $ sc_reg : 'synthdid_estimate' num [1, 1] -21.7
## ..- attr(*, "estimator")= chr "sc_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0 0 0 0 0 0 0 0 0 0 ...
## .. ..$ omega : num [1:38] 0 0 0.0857 0.0647 0 ...
## .. ..$ omega.vals: num [1:10000] 26.4 22.2 21.9 20.6 20.3 ...
## .. ..$ vals : num [1:10000] 26.4 22.2 21.9 20.6 20.3 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi FALSE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
## $ difp_reg: 'synthdid_estimate' num [1, 1] -16.1
## ..- attr(*, "estimator")= chr "synthdid_estimate"
## ..- attr(*, "weights")=List of 4
## .. ..$ lambda : num [1:19] 0.0526 0.0526 0.0526 0.0526 0.0526 ...
## .. ..$ omega : num [1:38] 0 0.00344 0.05751 0.07829 0.07037 ...
## .. ..$ omega.vals: num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## .. ..$ vals : num [1:10000] 13.2 12.5 12.1 11.7 11.6 ...
## ..- attr(*, "setup")=List of 4
## .. ..$ Y : num [1:39, 1:31] 89.8 100.3 124.8 120 155 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:39] "Alabama" "Arkansas" "Colorado" "Connecticut" ...
## .. .. .. ..$ : chr [1:31] "1970" "1971" "1972" "1973" ...
## .. ..$ X : logi[1:39, 1:31, 0 ]
## .. ..$ N0: int 38
## .. ..$ T0: num 19
## ..- attr(*, "opts")=List of 8
## .. ..$ zeta.omega : num 10.2
## .. ..$ zeta.lambda : num 5.49e-06
## .. ..$ omega.intercept : logi TRUE
## .. ..$ lambda.intercept: logi TRUE
## .. ..$ update.omega : logi TRUE
## .. ..$ update.lambda : logi FALSE
## .. ..$ min.decrease : num 5.49e-05
## .. ..$ max.iter : num 10000
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california.table) = c('estimate', 'standard error')
colnames(california.table) = toupper(names(estimators))
round(california.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -27.3 -19.6 -15.6 -11.1 -20.2 -21.7 -16.1
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
summary(california_prop99)
## State Year PacksPerCapita treated
## Alabama : 31 Min. :1970 Min. : 40.7 Min. :0.000000
## Arkansas : 31 1st Qu.:1977 1st Qu.:100.9 1st Qu.:0.000000
## California : 31 Median :1985 Median :116.3 Median :0.000000
## Colorado : 31 Mean :1985 Mean :118.9 Mean :0.009926
## Connecticut: 31 3rd Qu.:1993 3rd Qu.:130.5 3rd Qu.:0.000000
## Delaware : 31 Max. :2000 Max. :296.2 Max. :1.000000
## (Other) :1023
library(dplyr)
california_minus2 <- california_prop99 %>%
filter(! State %in% c("Nevada", "New Hampshire"))
# california_minus2 is created above (it is not shipped with synthdid), so no data() call is needed
setup <- synthdid::panel.matrices(california_minus2)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
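# Note: standard.errors was computed on the full sample (above) and is not recomputed for
# the subsamples, so the standard-error rows in this and the following tables repeat the
# original placebo values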
california2.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california2.table) = c('estimate', 'standard error')
colnames(california2.table) = toupper(names(estimators))
round(california2.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -30.1 -21.7 -19.5 -16.7 -24.0 -23.4 -20.6
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The point estimates are generally larger in magnitude than before, i.e., more biased. Nevada and New Hampshire previously carried the largest weights in constructing the counterfactual, so dropping them worsens the goodness of fit of the synthetic California.
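This can be verified directly from the original fit, since tau.hat stores the unit weights and summary() reports the largest ones:
# Donor-state (omega) weights from the original SDID fit: Nevada and New Hampshire sit at the top
summary(tau.hat)$controls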
california3 <- california_prop99 %>%
filter(Year > 1974)
setup <- synthdid::panel.matrices(california3)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california3.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california3.table) = c('estimate', 'standard error')
colnames(california3.table) = toupper(names(estimators))
round(california3.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -23.7 -20.3 -17.9 -19.6 -20.6 -23.1 -18.4
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The DID estimate is insignificant, and the other estimates are larger in magnitude than before. This is reasonable given that cigarette consumption before 1975 trends in the opposite direction (rising rather than falling) compared with the later pre-treatment years. Those relatively low early values pulled the pre-treatment outcome down, so dropping that portion allows a more linear fit of the pre-trends and captures a larger treatment effect.
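One way to see the early reversal is to plot California against the donor-pool average over the full sample; a quick sketch using the raw panel (column names as in california_prop99):
california_prop99 %>%
  mutate(group = if_else(State == "California", "California", "Donor average")) %>%
  group_by(group, Year) %>%
  summarise(packs = mean(PacksPerCapita), .groups = "drop") %>%
  ggplot(aes(Year, packs, colour = group)) +
  geom_line() +
  geom_vline(xintercept = 1989, linetype = "dashed") +  # treatment begins in 1989
  labs(y = "Packs per capita", colour = NULL)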
california4 <- california_prop99 %>%
filter(Year < 1996)
setup <- synthdid::panel.matrices(california4)
estimates <- lapply(estimators, function(estimator) {
estimator(setup$Y, setup$N0, setup$T0)
})
synthdid_plot(estimates[1:3], facet.vertical = FALSE,
control.name = 'control', treated.name = 'california',
lambda.comparable = TRUE, se.method = 'none',
trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
trajectory.alpha = .7, effect.alpha = .7,
diagram.alpha = 1, onset.alpha = .7) +
theme(legend.position = c(.26, .07),
legend.direction = 'horizontal',
legend.key = element_blank(),
legend.background = element_blank(),
strip.background = element_blank(),
strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
california4.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california4.table) = c('estimate', 'standard error')
colnames(california4.table) = toupper(names(estimators))
round(california4.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -22.2 -15.3 -9.8 -8.0 -14.2 -17.1 -10.8
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
The estimates become smaller in magnitude and mostly insignificant, which makes sense given that the gap between the treated and synthetic trajectories widens after the treatment kicks in. Cutting off the later treatment periods therefore eliminates the most informative observations, those with the largest changes.
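A quick check of how much post-treatment information remains after this cut (seven treated years, 1989-1995, versus twelve in the full sample):
# Treated (post-1988) years remaining in the truncated sample
sum(unique(california4$Year) > 1988)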
Research question: How did the surge in “tariffs” web searches affect the consumer price index (CPI) in the US, Canada, and Mexico?
library(readxl)
data_monthly <- read_excel("Applied_Econometrics.xlsx", sheet = "Monthly")
data_quarterly <- read_excel("Applied_Econometrics.xlsx", sheet = "Quarterly")
## Merge the monthly data with the quarterly data: assign each quarterly value to its corresponding months
library(dplyr)
library(tidyr)
library(lubridate)
monthly_data <- data_quarterly %>%
mutate(
quarter_start = floor_date(observation_date, unit = "quarter")
) %>%
rowwise() %>%
mutate(
months = list(seq(quarter_start, by = "1 month", length.out = 3))
) %>%
ungroup() %>%
select(-observation_date, -quarter_start) %>%
unnest(cols = months) %>%
rename(observation_date = months) %>%
select(observation_date, Rgdp_Canada, Rgdp_Mexico, Unemployment_mexico)
head(monthly_data)
## # A tibble: 6 × 4
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico
## <dttm> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92
## 4 2020-07-01 00:00:00 536725. 5538841 4.84
## 5 2020-08-01 00:00:00 536725. 5538841 4.84
## 6 2020-09-01 00:00:00 536725. 5538841 4.84
monthly_full <- monthly_data %>%
left_join(data_monthly, by = "observation_date") %>%
select(-Unemployment_MX)
head(monthly_full)
## # A tibble: 6 × 11
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92 107.
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92 108.
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92 108.
## 4 2020-07-01 00:00:00 536725. 5538841 4.84 108.
## 5 2020-08-01 00:00:00 536725. 5538841 4.84 108.
## 6 2020-09-01 00:00:00 536725. 5538841 4.84 108.
## # ℹ 6 more variables: CPI_US <dbl>, Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>
search_US <- read.csv("USTimeline.csv") %>%
rename(searches = "Tariffs...United.States.") %>%
mutate(
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(US_search = sum(searches, na.rm = TRUE))
head(search_US)
## # A tibble: 6 × 2
## observation_date US_search
## <date> <dbl>
## 1 2020-04-01 1
## 2 2020-05-01 2
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 0
## 6 2020-09-01 4
search_Canada <- read.csv("CanadaTimeline.csv") %>%
rename(searches = "Tariffs...Canada.") %>%
mutate(
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(Canada_search = sum(searches, na.rm = TRUE))
head(search_Canada)
## # A tibble: 6 × 2
## observation_date Canada_search
## <date> <dbl>
## 1 2020-04-01 0
## 2 2020-05-01 0
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 3
## 6 2020-09-01 1
search_Mexico <- read.csv("MexicoTimeline.csv") %>%
rename(searches = "Tariffs...Mexico.") %>%
mutate(
searches = as.character(searches),
searches = if_else(searches == "<1", "0", searches),
searches = as.numeric(searches)) %>%
mutate(Week = as.Date(Week)) %>%
mutate(observation_date = floor_date(Week, unit = "month")) %>%
group_by(observation_date) %>%
summarise(Mexico_search = sum(searches, na.rm = TRUE))
head(search_Mexico)
## # A tibble: 6 × 2
## observation_date Mexico_search
## <date> <dbl>
## 1 2020-04-01 0
## 2 2020-05-01 0
## 3 2020-06-01 0
## 4 2020-07-01 0
## 5 2020-08-01 0
## 6 2020-09-01 0
full_data <- monthly_full %>%
left_join(search_Mexico, by = "observation_date") %>%
left_join(search_Canada, by = "observation_date") %>%
left_join(search_US, by = "observation_date")
head(full_data)
## # A tibble: 6 × 14
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada
## <dttm> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 00:00:00 492032. 4797408. 4.92 107.
## 2 2020-05-01 00:00:00 492032. 4797408. 4.92 108.
## 3 2020-06-01 00:00:00 492032. 4797408. 4.92 108.
## 4 2020-07-01 00:00:00 536725. 5538841 4.84 108.
## 5 2020-08-01 00:00:00 536725. 5538841 4.84 108.
## 6 2020-09-01 00:00:00 536725. 5538841 4.84 108.
## # ℹ 9 more variables: CPI_US <dbl>, Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>,
## # Mexico_search <dbl>, Canada_search <dbl>, US_search <dbl>
#install.packages("zoo")
library(zoo)
# full_data is the merged dataset; define the event month (October 2024) and a relative-time index
reg_data <- full_data %>%
mutate(observation_date = ymd(as.character(observation_date))) %>%
mutate(
event_month = as.Date("2024-10-01"),
relative_time = as.integer((observation_date - event_month) / 30))
head(reg_data)
## # A tibble: 6 × 16
## observation_date Rgdp_Canada Rgdp_Mexico Unemployment_mexico CPI_Canada CPI_US
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-04-01 492032. 4797408. 4.92 107. 108.
## 2 2020-05-01 492032. 4797408. 4.92 108. 108.
## 3 2020-06-01 492032. 4797408. 4.92 108. 108.
## 4 2020-07-01 536725. 5538841 4.84 108. 109.
## 5 2020-08-01 536725. 5538841 4.84 108. 109.
## 6 2020-09-01 536725. 5538841 4.84 108. 110.
## # ℹ 10 more variables: Income_US <dbl>, Unemployment_Canada <dbl>,
## # CPI_Mexico <dbl>, US_Consumption <dbl>, Unemployment_US <dbl>,
## # Mexico_search <dbl>, Canada_search <dbl>, US_search <dbl>,
## # event_month <date>, relative_time <int>
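Dividing the day difference by 30 only approximates whole months. An exact month offset can be computed from the year and month components instead; this sketch adds a separate column (relative_time_exact, a name introduced here for illustration) and leaves the variable used below unchanged:
reg_data <- reg_data %>%
  mutate(relative_time_exact = 12 * (year(observation_date) - year(event_month)) +
           (month(observation_date) - month(event_month)))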
library(dplyr)
library(lubridate)
# Set the event date: October 2024
event_date <- ymd("2024-10-01")
# Prepare your data, creating the 'post_event' variable
reg_data <- reg_data %>%
mutate(
date = ymd(observation_date), # Ensure the 'date' column is in Date format
post_event = if_else(observation_date >= event_date, 1, 0) # 1 if after event, 0 if before
)
library(fixest)
# Run the DID regression model
DID_model <- feols(
CPI_US ~ US_search * post_event + US_Consumption + Unemployment_US,
data = reg_data
)
# Summarize the results
summary(DID_model)
## OLS estimation, Dep. Var.: CPI_US
## Observations: 57
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.073424 3.637384 7.992950 1.4902e-10 ***
## US_search 0.013319 0.102086 0.130469 8.9671e-01
## post_event -2.127462 1.615286 -1.317081 1.9370e-01
## US_Consumption 0.005097 0.000178 28.681504 < 2.2e-16 ***
## Unemployment_US 0.821262 0.137465 5.974326 2.2501e-07 ***
## US_search:post_event -0.015629 0.129373 -0.120807 9.0432e-01
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.26919 Adj. R2: 0.975796
library(fixest)
US_model <- feols(
CPI_US ~ i(relative_time, ref = -1)*US_search + US_Consumption + Unemployment_US,
data = reg_data
)
## The variables 'US_search', 'Unemployment_US', 'US_search:relative_time::-54', 'US_search:relative_time::-53', 'US_search:relative_time::-52', 'US_search:relative_time::-51' and 52 others have been removed because of collinearity (see $collin.var).
summary(US_model)
## Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
## OLS estimation, Dep. Var.: CPI_US
## Observations: 57
## Standard-errors: IID
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 256.00 0.000010 2.560000e+07 2.4868e-08 ***
## relative_time::-54 -32.00 0.045685 -7.004438e+02 9.0888e-04 ***
## relative_time::-53 -40.00 0.048417 -8.261570e+02 7.7058e-04 ***
## relative_time::-52 -48.00 0.020484 -2.343287e+03 2.7168e-04 ***
## relative_time::-51 -32.00 0.027332 -1.170783e+03 5.4376e-04 ***
## relative_time::-50 -32.00 0.066064 -4.843758e+02 1.3143e-03 **
## relative_time::-49 -32.00 0.056811 -5.632675e+02 1.1302e-03 **
## relative_time::-48 -48.00 0.036301 -1.322262e+03 4.8146e-04 ***
## relative_time::-47 -64.00 0.055262 -1.158122e+03 5.4970e-04 ***
## relative_time::-46 -32.00 0.004757 -6.727498e+03 9.4630e-05 ***
## relative_time::-45 -16.00 0.013558 -1.180142e+03 5.3944e-04 ***
## relative_time::-44 -40.00 0.069664 -5.741869e+02 1.1087e-03 **
## relative_time::-43 -56.00 0.054304 -1.031222e+03 6.1734e-04 ***
## relative_time::-42 -32.00 0.054469 -5.874880e+02 1.0836e-03 **
## relative_time::-41 -48.00 0.039246 -1.223046e+03 5.2052e-04 ***
## relative_time::-40 -32.00 0.015460 -2.069913e+03 3.0756e-04 ***
## relative_time::-39 -48.00 0.008608 -5.576093e+03 1.1417e-04 ***
## relative_time::-38 -24.00 0.021466 -1.118044e+03 5.6940e-04 ***
## relative_time::-37 -32.00 0.006622 -4.832633e+03 1.3173e-04 ***
## relative_time::-36 -8.00 0.018480 -4.328999e+02 1.4706e-03 **
## relative_time::-35 -32.00 0.004786 -6.686039e+03 9.5216e-05 ***
## relative_time::-34 -16.00 0.114895 -1.392572e+02 4.5715e-03 **
## relative_time::-33 -24.00 0.073169 -3.280065e+02 1.9409e-03 **
## relative_time::-32 -28.00 0.084084 -3.330009e+02 1.9118e-03 **
## relative_time::-31 -16.00 0.012959 -1.234621e+03 5.1564e-04 ***
## relative_time::-30 -16.00 0.058659 -2.727613e+02 2.3340e-03 **
## relative_time::-29 -32.00 0.026110 -1.225600e+03 5.1944e-04 ***
## relative_time::-28 -28.00 0.016236 -1.724536e+03 3.6915e-04 ***
## relative_time::-27 -20.00 0.033208 -6.022620e+02 1.0570e-03 **
## relative_time::-26 -16.00 0.024596 -6.505215e+02 9.7863e-04 ***
## relative_time::-25 -16.00 0.011101 -1.441321e+03 4.4169e-04 ***
## relative_time::-24 -12.00 0.037774 -3.176789e+02 2.0040e-03 **
## relative_time::-23 -16.00 0.014990 -1.067345e+03 5.9645e-04 ***
## relative_time::-22 -8.00 0.050513 -1.583764e+02 4.0196e-03 **
## relative_time::-21 -14.00 0.022056 -6.347487e+02 1.0029e-03 **
## relative_time::-20 -18.00 0.027117 -6.637994e+02 9.5905e-04 ***
## relative_time::-19 -14.00 0.042488 -3.295040e+02 1.9320e-03 **
## relative_time::-18 -6.00 0.024212 -2.478091e+02 2.5690e-03 **
## relative_time::-17 -8.00 0.056730 -1.410183e+02 4.5144e-03 **
## relative_time::-16 -4.00 0.013915 -2.874650e+02 2.2146e-03 **
## relative_time::-15 -6.00 0.034634 -1.732397e+02 3.6747e-03 **
## relative_time::-14 -12.00 0.030003 -3.999653e+02 1.5917e-03 **
## relative_time::-13 -4.00 0.006861 -5.829862e+02 1.0920e-03 **
## relative_time::-12 -4.00 0.005416 -7.385876e+02 8.6194e-04 ***
## relative_time::-11 -7.00 0.006559 -1.067157e+03 5.9656e-04 ***
## relative_time::-10 -6.00 0.007932 -7.564634e+02 8.4157e-04 ***
## relative_time::-9 -3.00 0.019622 -1.528897e+02 4.1639e-03 **
## relative_time::-8 -6.00 0.000054 -1.105491e+05 5.7587e-06 ***
## relative_time::-7 -1.00 0.018653 -5.361132e+01 1.1873e-02 *
## relative_time::-6 -5.00 0.017828 -2.804526e+02 2.2700e-03 **
## relative_time::-5 -1.00 0.006592 -1.517038e+02 4.1964e-03 **
## relative_time::-4 -3.50 0.002980 -1.174520e+03 5.4203e-04 ***
## relative_time::-3 -0.25 0.009188 -2.720922e+01 2.3387e-02 *
## relative_time::-2 0.25 0.005507 4.539377e+01 1.4022e-02 *
## relative_time::0 0.50 0.006404 7.807192e+01 8.1538e-03 **
## relative_time::1 1.00 0.004197 2.382410e+02 2.6722e-03 **
## relative_time::2 2.00 0.004428 4.517188e+02 1.4093e-03 **
## US_Consumption 0.00 22.193701 0.000000e+00 1.0000e+00
## ... 58 variables were removed because of collinearity (US_search, Unemployment_US and 56 others [full set in $collin.var])
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 115.0 Adj. R2: 10,131.7
coef_data <- coef(US_model)
model_summary <- summary(US_model)
## Variance contained negative values in the diagonal and was 'fixed' (a la Cameron, Gelbach & Miller 2011).
#se_data <- model_summary$coefficients[, 2] # The standard errors are in the second column
coef_df <- data.frame(
  term = names(coef_data),        # coefficients are named by the terms in the model
  coefficient = coef_data
  # std_error = se_data,
  # conf_lower = coef_data - 1.96 * se_data,  # 95% confidence interval lower bound
  # conf_upper = coef_data + 1.96 * se_data   # 95% confidence interval upper bound
) %>%
  # Keep only the event-study coefficients and parse the relative time as a number
  filter(grepl("^relative_time::", term)) %>%
  mutate(relative_time = as.numeric(sub("^relative_time::", "", term)))
# Remove NA or non-numeric rows if any
coef_df <- na.omit(coef_df)
ggplot(coef_df, aes(x = relative_time, y = coefficient)) +
geom_line(color = "blue") +
geom_point(color = "red") +
theme_minimal() +
labs(
title = "Event Study: Impact of 'Tariffs Web Searches' on CPI",
x = "Relative Time",
y = "Effect on CPI",
subtitle = "Event-time OLS specification"
) +
geom_vline(xintercept = 0, linetype = "dashed", color = "black")
There are not enough data points after the event; macroeconomic data for 2025 were not yet available.
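This is easy to confirm from the estimation sample:
# Months before (0) versus on/after (1) the October 2024 event
table(reg_data$post_event)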
Yes, it does. If the conditional-mean (exogeneity) and full-rank requirements are satisfied, OLS is the best linear unbiased estimator. The penalty term λI in the ridge estimator introduces bias by construction: with λ > 0 it shrinks the coefficients toward zero, causing bias.
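For reference, a sketch of the standard argument (λ is the ridge penalty and I the identity matrix):
$$\hat{\beta}_{\text{ridge}} = (X'X + \lambda I)^{-1} X'y, \qquad E\big[\hat{\beta}_{\text{ridge}} \mid X\big] = (X'X + \lambda I)^{-1} X'X\,\beta,$$
which equals β only when λ = 0; for λ > 0 the coefficients are shrunk toward zero, so the estimator is biased, while OLS remains unbiased and, under the Gauss-Markov conditions, BLUE.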
housingdata <- readRDS("testdata20250121.RDS")
library(tidyr)
library(wooldridge)
## Warning: package 'wooldridge' was built under R version 4.4.3
##
## Attaching package: 'wooldridge'
## The following object is masked from 'package:MASS':
##
## cement
library(fixest)
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
library(causalweight)
library(fixest)
library(dplyr)
library(lubridate)
housingdata <- housingdata %>%
mutate(sale_date = ymd(as.character(sale_date)))
#Separate columns
housingdata <- housingdata %>%
mutate(
year = year(sale_date),
month = month(sale_date),
day = day(sale_date)
)
#Add log of land square footage
housingdata$log_land_square_footage <- log(housingdata$land_square_footage)
#Add LOG SALE AMOUNT
housingdata$log_sale_amount <- log(housingdata$sale_amount)
training <- subset(housingdata, !(abbr %in% c("NY", "CO", "CA")))
test <- subset(housingdata, abbr %in% c("NY", "CO", "CA"))
# Load necessary libraries
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-1
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Define response and predictors
response <- "log_sale_amount"
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage", "AC_presence")
# Prepare data: training dataset
x_train <- as.matrix(as.data.frame(training)[, predictors])
y_train <- as.data.frame(training)[, response]
# Set seed for reproducibility
set.seed(123)
# Perform LASSO with 5-fold cross-validation
cv_lasso <- cv.glmnet(
x = x_train,
y = y_train,
alpha = 1, # alpha = 1 for LASSO
nfolds = 5,
type.measure = "mse"
)
# Plot cross-validation results
plot(cv_lasso)
# Best lambda (minimizing mse)
best_lambda <- cv_lasso$lambda.min
cat("Best lambda:", best_lambda, "\n")
## Best lambda: 0.001110023
# Coefficients of best model
coef(cv_lasso, s = "lambda.min")
## 8 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 5.261725555
## year_built 0.002818082
## bedrooms_all_buildings 0.025832984
## number_of_bathrooms 0.241310506
## number_of_fireplaces .
## stories_number 0.175381566
## log_land_square_footage 0.051093076
## AC_presence 0.025023535
y_mean <- mean(training$sale_amount)
print(y_mean)
## [1] 304779.4
The best lambda is 0.001110023. Treating the LASSO coefficient on AC_presence (≈ 0.025) as a semi-elasticity of price, MWTP ≈ 0.025 × 304,779.4 ≈ $7,619.49.
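The same figure can be computed directly from the fitted objects rather than by hand, reusing cv_lasso and y_mean from above:
# MWTP for AC: log-price semi-elasticity times the mean sale price in the training data
ac_coef <- coef(cv_lasso, s = "lambda.min")["AC_presence", 1]
round(ac_coef * y_mean, 2)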
# Filter test dataset: with AC and without AC
test_with_ac <- test[test$AC_presence == 1, ]
test_without_ac <- test[test$AC_presence == 0, ]
# Compute average values of predictors for both groups
avg_with_ac <- colMeans(test_with_ac[, ..predictors], na.rm = TRUE)
avg_without_ac <- colMeans(test_without_ac[, ..predictors], na.rm = TRUE)
# Retrieve LASSO coefficients (already fitted)
lasso_coef <- coef(cv_lasso, s = "lambda.min")
coef_names <- rownames(lasso_coef)
coef_values <- as.numeric(lasso_coef)
# Convert coefficients to named vector for easy lookup
beta <- setNames(coef_values, coef_names)
# Add intercept
intercept <- beta["(Intercept)"]
# Compute predicted log prices
pred_log_with_ac <- intercept + sum(beta[names(avg_with_ac)] * avg_with_ac)
pred_log_without_ac <- intercept + sum(beta[names(avg_without_ac)] * avg_without_ac)
# Convert from log price to price
pred_price_with_ac <- exp(pred_log_with_ac)
pred_price_without_ac <- exp(pred_log_without_ac)
# Compute difference in predicted price
price_diff <- pred_price_with_ac - pred_price_without_ac
# Output results
cat("Predicted price with AC:", round(pred_price_with_ac, 2), "\n")
## Predicted price with AC: 214771.4
cat("Predicted price without AC:", round(pred_price_without_ac, 2), "\n")
## Predicted price without AC: 193428.4
cat("Difference in predicted price:", round(price_diff, 2), "\n")
## Difference in predicted price: 21343
The difference in predicted house price is $21,343.
obs_diff <- test %>%
group_by(AC_presence) %>%
summarise(mean_observed_price = mean(sale_amount, na.rm = TRUE)) %>%
summarise(observed_diff = diff(mean_observed_price)) # with AC - without AC
print(obs_diff)
## # A tibble: 1 × 1
## observed_diff
## <dbl>
## 1 -88846.
The observed difference in mean price is -$88,845.84, which is opposite in sign and far larger in magnitude than the LASSO-predicted difference; the predicted and observed values are not close.
# --------------------------
# New York
# --------------------------
ny_data <- test[abbr == "NY"]
ny_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = ny_data)
ny_ac_coef <- coef(ny_model)["AC_presence"]
ny_avg_price <- exp(mean(ny_data$log_sale_amount, na.rm = TRUE))
ny_mwtp <- ny_ac_coef * ny_avg_price
cat("\n--- New York ---\n")
##
## --- New York ---
cat("AC Coefficient (log):", round(ny_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0895
cat("MWTP for AC ($):", round(ny_mwtp, 2), "\n")
## MWTP for AC ($): 45273.02
# --------------------------
# Colorado
# --------------------------
co_data <- test[abbr == "CO"]
co_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = co_data)
co_ac_coef <- coef(co_model)["AC_presence"]
co_avg_price <- exp(mean(co_data$log_sale_amount, na.rm = TRUE))
co_mwtp <- co_ac_coef * co_avg_price
cat("\n--- Colorado ---\n")
##
## --- Colorado ---
cat("AC Coefficient (log):", round(co_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0603
cat("MWTP for AC ($):", round(co_mwtp, 2), "\n")
## MWTP for AC ($): 21814.07
# --------------------------
# California
# --------------------------
ca_data <- test[abbr == "CA"]
ca_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = ca_data)
ca_ac_coef <- coef(ca_model)["AC_presence"]
ca_avg_price <- exp(mean(ca_data$log_sale_amount, na.rm = TRUE))
ca_mwtp <- ca_ac_coef * ca_avg_price
cat("\n--- California ---\n")
##
## --- California ---
cat("AC Coefficient (log):", round(ca_ac_coef, 4), "\n")
## AC Coefficient (log): -0.3004
cat("MWTP for AC ($):", round(ca_mwtp, 2), "\n")
## MWTP for AC ($): -138097.2
The marginal willingness to pay differs substantially across the three states, and each state-specific value differs from the overall estimate in part c.
According to the Frisch-Waugh-Lovell (FWL) theorem, both claims are correct.
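A small simulated illustration of the FWL result (base R only): the coefficient on x1 from the full regression equals the coefficient from regressing residualized y on residualized x1.
# FWL: partialling x2 out of both y and x1 reproduces the full-regression coefficient on x1
set.seed(1)
n  <- 500
x2 <- rnorm(n)
x1 <- 0.5 * x2 + rnorm(n)
y  <- 1 + 2 * x1 - x2 + rnorm(n)
full_fit <- lm(y ~ x1 + x2)
y_tilde  <- resid(lm(y ~ x2))   # y with x2 partialled out
x1_tilde <- resid(lm(x1 ~ x2))  # x1 with x2 partialled out
partial_fit <- lm(y_tilde ~ x1_tilde)
c(full = unname(coef(full_fit)["x1"]), fwl = unname(coef(partial_fit)["x1_tilde"]))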
install.packages("causaldrf")
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'causaldrf' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\zxl5576\AppData\Local\Temp\RtmpyO7CmZ\downloaded_packages
library(causalweight)
library(glmnet)
library(causaldrf)
## Warning: package 'causaldrf' was built under R version 4.4.3
CA_data <- test[abbr == "CA"]
# Define treatment variable (AC presence) and outcome variable (log_sale_amount)
treatment <- CA_data$AC_presence
outcome <- CA_data$log_sale_amount
# Define predictor columns in correct order
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage", "AC_presence")
# Filter CA dataset: with AC and without AC
CA_data_with_ac <- CA_data[CA_data$AC_presence == 1, ]
CA_data_without_ac <- CA_data[CA_data$AC_presence == 0, ]
# Compute average values of predictors for both groups
avg_with_ac <- colMeans(CA_data_with_ac[, ..predictors], na.rm = TRUE)
avg_without_ac <- colMeans(CA_data_without_ac[, ..predictors], na.rm = TRUE)
# Retrieve LASSO coefficients (already fitted)
lasso_coef <- coef(cv_lasso, s = "lambda.min")
coef_names <- rownames(lasso_coef)
coef_values <- as.numeric(lasso_coef)
# Convert coefficients to named vector for easy lookup
beta <- setNames(coef_values, coef_names)
# Add intercept (should match the intercept in the LASSO model)
intercept <- beta["(Intercept)"]
# Compute predicted log prices for homes with and without AC
pred_log_with_ac <- intercept + sum(beta[names(avg_with_ac)] * avg_with_ac)
pred_log_without_ac <- intercept + sum(beta[names(avg_without_ac)] * avg_without_ac)
# Convert from log price to actual price
pred_price_with_ac <- exp(pred_log_with_ac)
pred_price_without_ac <- exp(pred_log_without_ac)
# Compute difference in predicted prices
price_diff <- pred_price_with_ac - pred_price_without_ac
# Output the results
cat("Predicted price with AC:", round(pred_price_with_ac, 2), "\n")
## Predicted price with AC: 205505.3
cat("Predicted price without AC:", round(pred_price_without_ac, 2), "\n")
## Predicted price without AC: 177486.3
cat("Difference in predicted price:", round(price_diff, 2), "\n")
## Difference in predicted price: 28019.02
outcome <- "log_sale_amount" # Example outcome variable
treatment <- "AC_presence" # Treatment variable (whether the house has AC)
covariates <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number",
"log_land_square_footage") # List of covariates to control for
# Prepare the data (CA_data is your dataset)
X <- as.matrix(CA_data[, covariates, with = FALSE]) # Covariates matrix
Y <- CA_data[[outcome]] # Outcome variable
D <- CA_data[[treatment]] # Treatment variable
# Set the seed for reproducibility
set.seed(123)
# Apply Double Machine Learning (DML) using LASSO
dml_model <- treatDML(y = Y,
d = D,
x = X,
MLmethod = "lasso",
k = 3) # 3-fold cross-validation
## Loading required package: nnls
# Extract the estimated effect of AC on log sale price (a semi-elasticity, not a dollar amount)
mwtp <- dml_model$effect  # treatDML stores the ATE in $effect
#cat("Marginal Willingness to Pay (MWTP) for AC:", round(mwtp, 4), "\n")