Setting Working Directory

setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 6")

#Libraries

# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, 
       cobalt, Matchit, Zelig, estimatr, cem, tidyverse, 
       lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
library(wooldridge)
## 
## Attaching package: 'wooldridge'
## The following object is masked from 'package:MASS':
## 
##     cement
library(fixest)
## 
## Attaching package: 'fixest'
## The following object is masked from 'package:scales':
## 
##     pvalue
## The following object is masked from 'package:Jmisc':
## 
##     demean
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
library(causalweight)
## Loading required package: ranger
library(fixest)

PROBLEM 1

#Load Data
install.packages('devtools')
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Lenovo\AppData\Local\Temp\RtmpuUZaKE\downloaded_packages
devtools::install_github('synth-inference/synthdid', ref='sdid-paper')
## Skipping install of 'synthdid' from a github remote, the SHA1 (36923192) has not changed since last install.
##   Use `force = TRUE` to force installation
install.packages(c('dplyr', 'tidyr', 'tibble', 'ggplot2', 'devtools', 'xtable'))
## Warning: packages 'dplyr', 'tidyr', 'tibble', 'ggplot2' are in use and will not
## be installed
## Installing packages into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'devtools' successfully unpacked and MD5 sums checked
## package 'xtable' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Lenovo\AppData\Local\Temp\RtmpuUZaKE\downloaded_packages
#devtools::install_github('cran/glmnet',              ref='f4fc95ab49efaad9b6e1728a7c840bc6159501dc')

devtools::install_github('susanathey/MCPanel',       ref='6b2706fd7c35f3266048ceb22a7e9a61ae1774da')
## Skipping install of 'MCPanel' from a github remote, the SHA1 (6b2706fd) has not changed since last install.
##   Use `force = TRUE` to force installation
library(synthdid)
library(ggplot2)
set.seed(12345)

Part i. Partial replication

(a) Replicate Table 1 from Arkhangelsky et al., 2021

Call Data and test the estimate

data('california_prop99')
setup = panel.matrices(california_prop99)
tau.hat = synthdid_estimate(setup$Y, setup$N0, setup$T0)

tau.hat
## synthdid: -15.604 +- NA. Effective N0/N0 = 16.4/38~0.4. Effective T0/T0 = 2.8/19~0.1. N1,T1 = 1,12.
print(summary(tau.hat))
## $estimate
## [1] -15.60383
## 
## $se
##      [,1]
## [1,]   NA
## 
## $controls
##                estimate 1
## Nevada              0.124
## New Hampshire       0.105
## Connecticut         0.078
## Delaware            0.070
## Colorado            0.058
## Illinois            0.053
## Nebraska            0.048
## Montana             0.045
## Utah                0.042
## New Mexico          0.041
## Minnesota           0.039
## Wisconsin           0.037
## West Virginia       0.034
## North Carolina      0.033
## Idaho               0.031
## Ohio                0.031
## Maine               0.028
## Iowa                0.026
## 
## $periods
##      estimate 1
## 1988      0.427
## 1986      0.366
## 1987      0.206
## 
## $dimensions
##           N1           N0 N0.effective           T1           T0 T0.effective 
##        1.000       38.000       16.388       12.000       19.000        2.783
se = sqrt(vcov(tau.hat, method='placebo'))
sprintf('point estimate: %1.2f', tau.hat)
## [1] "point estimate: -15.60"
sprintf('95%% CI (%1.2f, %1.2f)', tau.hat - 1.96 * se, tau.hat + 1.96 * se)
## [1] "95% CI (-32.01, 0.80)"

Create parallel cluster for heavy computation

library(parallel)
library(doSNOW)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: snow
## 
## Attaching package: 'snow'
## The following objects are masked from 'package:parallel':
## 
##     closeNode, clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, clusterSplit, makeCluster, parApply,
##     parCapply, parLapply, parRapply, parSapply, recvData, recvOneData,
##     sendData, splitIndices, stopCluster
library(foreach)

#For quicker processing
paraprocess <- 1
if (paraprocess == 1) {
  availcores <- detectCores()
  numcores   <- max(1, floor(availcores / 1.2))
  cl <- makeCluster(numcores, type = "SOCK")
  registerDoSNOW(cl)
  clusterSetRNGStream(cl, 12345)
}

Derive estimates

library(MCPanel)
did_estimate = function(Y, N0, T0) {
  synthdid::did_estimate(Y, N0, T0)
}

sc_estimate_reg = function(Y, N0, T0) {
  synthdid::sc_estimate(Y, N0, T0, eta.omega = ((nrow(Y) - N0) * (ncol(Y) - T0))^(1/4))
}

sdid_estimate = function(Y, N0, T0) {
  synthdid::synthdid_estimate(Y, N0, T0)
}

mc_estimate = function(Y, N0, T0) {
    N1=nrow(Y)-N0
    T1=ncol(Y)-T0
    W <- outer(c(rep(0,N0),rep(1,N1)),c(rep(0,T0),rep(1,T1)))
    mc_pred <- mcnnm_cv(Y, 1-W, num_lam_L = 20)
    mc_fit  <- mc_pred$L + outer(mc_pred$u, mc_pred$v, '+')
    mc_est <- sum(W*(Y-mc_fit))/sum(W)
    mc_est
}

mc_placebo_se_parallel = function(Y, N0, T0, replications = 200) {
  N1 <- nrow(Y) - N0
  theta <- function(ind) {
    mc_estimate(Y[ind, ], length(ind) - N1, T0)
  }
  placebo_ests <- foreach(i = 1:replications, .combine = c) %dopar% {
    theta(sample(1:N0))
  }
  sqrt((replications - 1) / replications) * sd(placebo_ests)
}              

difp_estimate = function(Y, N0, T0) {
    synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)), eta.omega=1e-6)
}


difp_estimate_reg = function(Y, N0, T0) {
    synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)))
}

#save all estimates
estimators = list(did=did_estimate,
                  sc=sc_estimate,
                  sdid=synthdid_estimate,
                  difp=difp_estimate,
                  mc = mc_estimate,
                  sc_reg = sc_estimate_reg,
                  difp_reg = difp_estimate_reg)

Create a table with estimates and standard errors

library(ggplot2)
library(synthdid)

set.seed(12345)
data('california_prop99')
setup <- synthdid::panel.matrices(california_prop99)

estimates <- lapply(estimators, function(estimator) {
  estimator(setup$Y, setup$N0, setup$T0)
})

# Extract point estimates
point_estimates <- sapply(estimates, function(x) as.numeric(x))

# Extract standard errors (placebo method)
standard_errors <- sapply(estimates, function(x) {
  if (inherits(x, "synthdid_estimate")) {
    sqrt(vcov(x, method = "placebo"))
  } else {
    NA 
  }
})

# Select only SDID, SC, DID, 
methods_to_keep <- c("sdid", "sc", "did")
filtered_estimates <- point_estimates[methods_to_keep]
filtered_ses <- standard_errors[methods_to_keep]

# Create a nice 2-row data.frame
final_table <- rbind(
  Estimate = round(filtered_estimates, 1),
  `Standard Error` = paste0("(", round(filtered_ses, 1), ")")
)

#Rename columns
colnames(final_table) <- toupper(methods_to_keep)

# Print final table
cat("Table 1\n\n")
## Table 1
print(final_table, quote = FALSE)
##                SDID  SC     DID   
## Estimate       -15.6 -19.6  -27.3 
## Standard Error (9.9) (10.9) (17.7)

(b) Replicate Figure 1

Plot 1: Creating plot for consumption in California in pre and post treatment periods

synthdid_plot(estimates[1:3], facet.vertical = FALSE,
              control.name = 'control', treated.name = 'california',
              lambda.comparable = TRUE, se.method = 'none',
              trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
              trajectory.alpha = .7, effect.alpha = .7,
              diagram.alpha = 1, onset.alpha = .7) +
    labs(title = "Figure 1(a)") +
  theme(legend.position = c(.26, .07), 
        legend.direction = 'horizontal',
        legend.key = element_blank(), 
        legend.background = element_blank(),
        strip.background = element_blank(), 
        strip.text.x = element_blank())
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Plot 2: Difference in consumption in states based on different methods

synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
  labs(title = "Figure 1(b)") +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

Part ii - Extensions

(a) Restricted donor pool

Subset the data

library(dplyr)
california_restricted <- california_prop99 %>%
  filter(! State %in% c("Nevada", "New Hampshire"))

fwrite(california_restricted, "california_restricted.csv")

Estimation

library(synthdid)
setup = synthdid::panel.matrices(california_restricted)

estimates <- lapply(estimators, function(estimator) {
  estimator(setup$Y, setup$N0, setup$T0)
})

# Extract point estimates
point_estimates <- sapply(estimates, function(x) as.numeric(x))

# Extract standard errors (placebo method)
standard_errors <- sapply(estimates, function(x) {
  if (inherits(x, "synthdid_estimate")) {
    sqrt(vcov(x, method = "placebo"))
  } else {
    NA  # mark MC (and others without SE) as NA
  }
})

# Select only SDID, SC, DID, 
methods_to_keep <- c("sdid", "sc", "did")
filtered_estimates <- point_estimates[methods_to_keep]
filtered_ses <- standard_errors[methods_to_keep]

# Create a nice 2-row data.frame
final_table <- rbind(
  Estimate = round(filtered_estimates, 1),
  `Standard Error` = paste0("(", round(filtered_ses, 1), ")")
)

#Rename columns
colnames(final_table) <- toupper(methods_to_keep)

# Print final table
cat("Table 2\n\n")
## Table 2
print(final_table, quote = FALSE)
##                SDID  SC     DID   
## Estimate       -19.5 -21.7  -30.1 
## Standard Error (8.7) (12.8) (12.7)

Plot 1

synthdid_plot(estimates[1:3], facet.vertical = FALSE,
              control.name = 'control', treated.name = 'california',
              lambda.comparable = TRUE, se.method = 'none',
              trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
              trajectory.alpha = .7, effect.alpha = .7,
              diagram.alpha = 1, onset.alpha = .7) +
    labs(title = "Figure 2(a)") +
  theme(legend.position = c(.26, .07), 
        legend.direction = 'horizontal',
        legend.key = element_blank(), 
        legend.background = element_blank(),
        strip.background = element_blank(), 
        strip.text.x = element_blank())

Plot 2

synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
  labs(title = "Figure 2(b)") +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

The estimates are more negative than the previous results and standard errors have increased for the increased for SDID and SC but reduced for DID. Thus, DID estimate becomes significant as opposed to previous results. Reducing the dataset by two states loses important points to get meaningful results.

However, SDID shows the most stable distribution of weights and low variance.

(b) Restricted pre-treatment periods

Subset post data only

california_not_pre <- california_prop99 %>%
  filter(Year > 1974)

Run setup and get estimates

setup <- synthdid::panel.matrices(california_not_pre)

estimates <- lapply(estimators, function(estimator) {
  estimator(setup$Y, setup$N0, setup$T0)
})

# Extract point estimates
point_estimates <- sapply(estimates, function(x) as.numeric(x))

# Extract standard errors (placebo method)
standard_errors <- sapply(estimates, function(x) {
  if (inherits(x, "synthdid_estimate")) {
    sqrt(vcov(x, method = "placebo"))
  } else {
    NA  # mark MC (and others without SE) as NA
  }
})

# Select only SDID, SC, DID, 
methods_to_keep <- c("sdid", "sc", "did")
filtered_estimates <- point_estimates[methods_to_keep]
filtered_ses <- standard_errors[methods_to_keep]

# Create a nice 2-row data.frame
final_table <- rbind(
  Estimate = round(filtered_estimates, 1),
  `Standard Error` = paste0("(", round(filtered_ses, 1), ")")
)

#Rename columns
colnames(final_table) <- toupper(methods_to_keep)

# Print final table
cat("Table 3\n\n")
## Table 3
print(final_table, quote = FALSE)
##                SDID  SC     DID   
## Estimate       -17.9 -20.3  -23.7 
## Standard Error (9.2) (10.2) (13.8)

Plot 1

synthdid_plot(estimates[1:3], facet.vertical = FALSE,
              control.name = 'control', treated.name = 'california',
              lambda.comparable = TRUE, se.method = 'none',
              trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
              trajectory.alpha = .7, effect.alpha = .7,
              diagram.alpha = 1, onset.alpha = .7) +
  labs(title = "Figure 3(a)") +
  theme(legend.position = c(.26, .07), 
        legend.direction = 'horizontal',
        legend.key = element_blank(), 
        legend.background = element_blank(),
        strip.background = element_blank(), 
        strip.text.x = element_blank())

Plot 2

synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
  labs(title = "Figure 3(b)") +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

The estimates increase than the original replication in all estimators. The SC estimate becomes significant in this run while others remain insignificant. If we remove the pre-treatment period, we are only estimating post period effect which will be larger and insignificant as there are no controls.

(c) Restricted post-treatment periods

Subset the data

california_not_post <- california_prop99 %>%
  filter(Year < 1996)

Derive estimates

setup <- synthdid::panel.matrices(california_not_post)

estimates <- lapply(estimators, function(estimator) {
  estimator(setup$Y, setup$N0, setup$T0)
})

# Extract point estimates
point_estimates <- sapply(estimates, function(x) as.numeric(x))

# Extract standard errors (placebo method)
standard_errors <- sapply(estimates, function(x) {
  if (inherits(x, "synthdid_estimate")) {
    sqrt(vcov(x, method = "placebo"))
  } else {
    NA  # mark MC (and others without SE) as NA
  }
})

# Select only SDID, SC, DID, 
methods_to_keep <- c("sdid", "sc", "did")
filtered_estimates <- point_estimates[methods_to_keep]
filtered_ses <- standard_errors[methods_to_keep]

# Create a nice 2-row data.frame
final_table <- rbind(
  Estimate = round(filtered_estimates, 1),
  `Standard Error` = paste0("(", round(filtered_ses, 1), ")")
)

#Rename columns
colnames(final_table) <- toupper(methods_to_keep)

# Print final table
cat("Table 4\n\n")
## Table 4
print(final_table, quote = FALSE)
##                SDID  SC    DID   
## Estimate       -9.8  -15.3 -22.2 
## Standard Error (7.5) (9.5) (16.6)

Plot 1

synthdid_plot(estimates[1:3], facet.vertical = FALSE,
              control.name = 'control', treated.name = 'california',
              lambda.comparable = TRUE, se.method = 'none',
              trajectory.linetype = 1, line.width = .75, effect.curvature = -.4,
              trajectory.alpha = .7, effect.alpha = .7,
              diagram.alpha = 1, onset.alpha = .7) +
  labs(title = "Figure 4(a)") +
  theme(legend.position = c(.26, .07), 
        legend.direction = 'horizontal',
        legend.key = element_blank(), 
        legend.background = element_blank(),
        strip.background = element_blank(), 
        strip.text.x = element_blank())

Plot 2

synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
  labs(title = "Figure 4(b)") +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

Estimates become smaller insignificant, other than the SC, since we eliminate entire treatment period and the results pre-treatment should be insignificant. Eliminating post treatment data removes any information of larger and significant changes.

PROBLEM 3: MACHINE LEARNING

Part i. Ridge Regression

If Y = X’beta + U - (1) where E[U|X] = 0, and we have beta_R = (X’X + lambda*I)^-1X’Y - (2)

Adding expectation it gives:

E[beta_R|X] = E[(X’X + lambda*I)^-1X’Y|X]

Replace Y in equation 2 with equation 1,

E[beta_R|X] = E[(X’X + lambda*I)^-1X’(X’beta + U)|X]

Since E[U|X] = 0, the above equation solves to:

E[beta_R|X] = (X’X + lambda*I)^-1(X’X)beta - (3)

The unbiased estimator of beta_R should be such that E(beta_Rhat|X) = beta, but here we have

E[beta_R|X] = (X’X + lambda*I)^-1(X’X)beta

Since lambda > 0, (X’X + lambda*I) is invertible but different from (X’X),

therefore, E(beta_Rhat|X) =/= beta

Thus, ridge estimator is biased when lambda > 0.

Part ii. LASSO

#Load Data
housingdata <- readRDS("testdata20250121.RDS")

(a) Test and Training Datasets

#Clean Dataset
#Change the string to YYY-MM-DD format
housingdata <- housingdata %>%
  mutate(sale_date = ymd(sale_date)) 

#Separate columns
housingdata <- housingdata %>%
  mutate(
    year = year(sale_date),
    month = month(sale_date),
    day = day(sale_date)
  )

#Add log of land square footage
housingdata$log_land_square_footage <- log(housingdata$land_square_footage)

#Add LOG SALE AMOUNT
housingdata$log_sale_amount <- log(housingdata$sale_amount)

Creating test and training datasets

training <- subset(housingdata, !(abbr %in% c("NY", "CO", "CA")))
test <- subset(housingdata, abbr %in% c("NY", "CO", "CA"))

(b) Estimate hedonic price model

# Load necessary libraries
install.packages("glmnet")
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## package 'glmnet' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'glmnet'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Lenovo\AppData\Local\R\win-library\4.4\00LOCK\glmnet\libs\x64\glmnet.dll
## to C:\Users\Lenovo\AppData\Local\R\win-library\4.4\glmnet\libs\x64\glmnet.dll:
## Permission denied
## Warning: restored 'glmnet'
## 
## The downloaded binary packages are in
##  C:\Users\Lenovo\AppData\Local\Temp\RtmpuUZaKE\downloaded_packages
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Define response and predictors
response <- "log_sale_amount"
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
                "number_of_fireplaces", "stories_number", 
                "log_land_square_footage", "AC_presence")

# Prepare data: training dataset
x_train <- as.matrix(as.data.frame(training)[, predictors])
y_train <- as.data.frame(training)[, response]

# Set seed for reproducibility
set.seed(123)

# Perform LASSO with 5-fold cross-validation
cv_lasso <- cv.glmnet(
  x = x_train,
  y = y_train,
  alpha = 1,                 # alpha = 1 for LASSO
  nfolds = 5,
  type.measure = "mse"
)

# Plot cross-validation results
plot(cv_lasso)

# Best lambda (minimizing mse)
best_lambda <- cv_lasso$lambda.min
cat("Best lambda:", best_lambda, "\n")
## Best lambda: 0.001110023

(c) Best Lambda

The best lambda is 0.001110023.

# Coefficients of best model
coef(cv_lasso, s = "lambda.min")
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)             5.261725554
## year_built              0.002818082
## bedrooms_all_buildings  0.025832984
## number_of_bathrooms     0.241310506
## number_of_fireplaces    .          
## stories_number          0.175381566
## log_land_square_footage 0.051093076
## AC_presence             0.025023535

According to the coefficient for AC, having AC increases house sale prices by approximately 2.5%. The marginal willingness to pay (MWTP) is coefficient*average sale price:

# Calculate MWTP
mean_log_price <- mean(training$log_sale_amount, na.rm = TRUE)
mean_price <- exp(mean_log_price)
MWTP_AC <- 0.0250 * mean_price
MWTP_AC
## [1] 5498.136

Therefore, MWTP for AC is ~$5500.

(d) Predicted average house price with or without AC

# Define predictor columns in correct order
predictors <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
                "number_of_fireplaces", "stories_number", 
                "log_land_square_footage", "AC_presence")

# Filter test dataset: with AC and without AC
test_with_ac <- test[test$AC_presence == 1, ]
test_without_ac <- test[test$AC_presence == 0, ]

# Compute average values of predictors for both groups
avg_with_ac <- colMeans(test_with_ac[, ..predictors], na.rm = TRUE)
avg_without_ac <- colMeans(test_without_ac[, ..predictors], na.rm = TRUE)

# Retrieve LASSO coefficients (already fitted)
lasso_coef <- coef(cv_lasso, s = "lambda.min")
coef_names <- rownames(lasso_coef)
coef_values <- as.numeric(lasso_coef)

# Convert coefficients to named vector for easy lookup
beta <- setNames(coef_values, coef_names)

# Add intercept
intercept <- beta["(Intercept)"]

# Compute predicted log prices
pred_log_with_ac <- intercept + sum(beta[names(avg_with_ac)] * avg_with_ac)
pred_log_without_ac <- intercept + sum(beta[names(avg_without_ac)] * avg_without_ac)

# Convert from log price to price
pred_price_with_ac <- exp(pred_log_with_ac)
pred_price_without_ac <- exp(pred_log_without_ac)

# Compute difference in predicted price
price_diff <- pred_price_with_ac - pred_price_without_ac

# Output results
cat("Predicted price with AC:", round(pred_price_with_ac, 2), "\n")
## Predicted price with AC: 214771.4
cat("Predicted price without AC:", round(pred_price_without_ac, 2), "\n")
## Predicted price without AC: 193428.4
cat("Difference in predicted price:", round(price_diff, 2), "\n")
## Difference in predicted price: 21343

Difference in predicted house price is $21,343.

(e) Difference in Observed Housing Prices

# Calculate observed average log sale amount in the test dataset
avg_log_price_with_ac <- mean(test$log_sale_amount[test$AC_presence == 1], na.rm = TRUE)
avg_log_price_without_ac <- mean(test$log_sale_amount[test$AC_presence == 0], na.rm = TRUE)

# Convert to actual price (exponentiate)
obs_price_with_ac <- exp(avg_log_price_with_ac)
obs_price_without_ac <- exp(avg_log_price_without_ac)

# Observed price difference
obs_price_diff <- obs_price_with_ac - obs_price_without_ac

# Print results
cat("Observed average price with AC:", round(obs_price_with_ac, 2), "\n")
## Observed average price with AC: 390327.3
cat("Observed average price without AC:", round(obs_price_without_ac, 2), "\n")
## Observed average price without AC: 472704.5
cat("Observed difference in price:", round(obs_price_diff, 2), "\n")
## Observed difference in price: -82377.15

In the predicted set up, prices with AC were higher, but here it is lower. In actual test observed dataset, the values are $82,800 lower for houses with AC.

The values are not close as predicted.

(f) MWTP in NY, CO, CA

# --------------------------
# New York
# --------------------------
ny_data <- test[abbr == "NY"]

ny_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
               number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
               data = ny_data)

ny_ac_coef <- coef(ny_model)["AC_presence"]
ny_avg_price <- exp(mean(ny_data$log_sale_amount, na.rm = TRUE))
ny_mwtp <- ny_ac_coef * ny_avg_price

cat("\n--- New York ---\n")
## 
## --- New York ---
cat("AC Coefficient (log):", round(ny_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0895
cat("MWTP for AC ($):", round(ny_mwtp, 2), "\n")
## MWTP for AC ($): 45273.02
# --------------------------
# Colorado
# --------------------------
co_data <- test[abbr == "CO"]

co_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
               number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
               data = co_data)

co_ac_coef <- coef(co_model)["AC_presence"]
co_avg_price <- exp(mean(co_data$log_sale_amount, na.rm = TRUE))
co_mwtp <- co_ac_coef * co_avg_price

cat("\n--- Colorado ---\n")
## 
## --- Colorado ---
cat("AC Coefficient (log):", round(co_ac_coef, 4), "\n")
## AC Coefficient (log): 0.0603
cat("MWTP for AC ($):", round(co_mwtp, 2), "\n")
## MWTP for AC ($): 21814.07
# --------------------------
# California
# --------------------------
ca_data <- test[abbr == "CA"]

ca_model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
               number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
               data = ca_data)

ca_ac_coef <- coef(ca_model)["AC_presence"]
ca_avg_price <- exp(mean(ca_data$log_sale_amount, na.rm = TRUE))
ca_mwtp <- ca_ac_coef * ca_avg_price

cat("\n--- California ---\n")
## 
## --- California ---
cat("AC Coefficient (log):", round(ca_ac_coef, 4), "\n")
## AC Coefficient (log): -0.3004
cat("MWTP for AC ($):", round(ca_mwtp, 2), "\n")
## MWTP for AC ($): -138097.2

The marginal willingness to pay is very different for all the states compared to the overall value in part c. Thus, results are not similar at all.

Part iii. Partialling Out

According to FWL theorem, in a problem set up such as in Equation 1, we regress X1 on Y and save their residuals - mu_hat, and X2 on X1 to get beta2_hat, and finally we regress beta2_hat on mu_hat to get the final estimation. According to this logic, Alternative 1 suggested by the friend is the more appropriate way. The alternative 2 does not partial out X2 from X1, thus, will not take out the X1-X2 correlation.

Therefore, their claim 1 is true and claim 2 is false.

Part iv. Debiased ML

(a) treatDML

# Load necessary package
library(causalweight)

# Subset to California
ca_data <- test[abbr == "CA"]

ca_data <- as.data.frame(ca_data)


# Define outcome, treatment, and covariates
y_var <- "log_sale_amount"
d_var <- "AC_presence"
x_vars <- c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
            "number_of_fireplaces", "stories_number", "log_land_square_footage")
x_mat <- ca_data[, x_vars]


# Run treatDML with LASSO and 3-fold cross-fitting
set.seed(123)
system.time({
  dml_result <- treatDML(
    y = ca_data[[y_var]],
    d = ca_data[[d_var]],
    x = x_mat,
    k = 3
  )
})
## Loading required package: nnls
##    user  system elapsed 
## 3964.03  116.69 4086.70
# Print the result
summary(dml_result)
##             Length Class  Mode   
## effect           1 -none- numeric
## se               1 -none- numeric
## pval             1 -none- numeric
## ntrimmed         1 -none- numeric
## meantreat        1 -none- numeric
## meancontrol      1 -none- numeric
## pstreat     763102 -none- numeric
## pscontrol   763102 -none- numeric

#Alternative Solution

# Load required packages
library(causalweight)
library(glmnet)
library(dplyr)

# 1. Filter California test data
ca_test <- test %>% 
  filter(abbr == "CA")  # California subset

x_matrix <- ca_test %>%
  dplyr::select(year_built, bedrooms_all_buildings, number_of_bathrooms,
         number_of_fireplaces, stories_number, log_land_square_footage) %>%
  as.matrix()

# 3. Apply Double Machine Learning with LASSO
set.seed(123)  # For reproducibility
dml_result <- treatDML(
  y = ca_test$log_sale_amount,                     # Outcome variable
  d = ca_test$AC_presence,                    # Treatment variable (AC)
  x = x_matrix,
  MLmethod = "lasso",                   # ML method (LASSO)
  k = 3,                             # 3-fold cross-fitting
  trim = 0.1,                        # Trimming threshold (optional)
  normalized = TRUE                  # Standardize variables
)

# 4. Extract treatment effect (AC coefficient)
ac_effect <- dml_result$eff
cat("Estimated treatment effect of AC:", ac_effect, "\n")
## Estimated treatment effect of AC: -0.2982498
# 5. Avg sales price
avg_sale_test <- mean(ca_test$log_sale_amount)
avg_sale_test <- exp(avg_sale_test)

# 6. MWTP for AC
ca_test_mwtp <- avg_sale_test*ac_effect
ca_test_mwtp
## [1] -137108

The MWTP reduces by 137,108 when houses have AC.

PROBLEM 4

Part i. Conventional and Robust RDD (Sharp design)

(a) Plot

library(rdrobust)
## Warning: package 'rdrobust' was built under R version 4.4.3
# Load the data
data("rdrobust_RDsenate")

# Quick scatter plot to visualize outcome vs running variable
plot(rdrobust_RDsenate$margin, rdrobust_RDsenate$vote,
     xlab = "Democratic Margin of Victory (Previous Election)",
     ylab = "Democratic Vote Share (Next Election)",
     main = "Outcome vs Running Variable",
     pch = 20, col = "blue")

# Add vertical line at margin = 0 (the cutoff)
abline(v = 0, col = "red", lwd = 2, lty = 2)

(b) Optimal Bandwidth

Using “rdrobust”

# Load necessary library
library(rdrobust)

# Run the rdrobust estimation
rdr_result <- rdrobust(
  y = rdrobust_RDsenate$vote,      # Outcome variable
  x = rdrobust_RDsenate$margin,    # Running variable
  c = 0                            # Cutoff at 0
)

# Print full output
summary(rdr_result)
## Sharp RD estimates using local polynomial regression.
## 
## Number of Obs.                 1297
## BW type                       mserd
## Kernel                   Triangular
## VCE method                       NN
## 
## Number of Obs.                  595          702
## Eff. Number of Obs.             360          323
## Order est. (p)                    1            1
## Order bias  (q)                   2            2
## BW est. (h)                  17.754       17.754
## BW bias (b)                  28.028       28.028
## rho (h/b)                     0.633        0.633
## Unique Obs.                     595          665
## 
## =============================================================================
##         Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
## =============================================================================
##   Conventional     7.414     1.459     5.083     0.000     [4.555 , 10.273]    
##         Robust         -         -     4.311     0.000     [4.094 , 10.919]    
## =============================================================================

The results suggest that winning the previous Senate election increases the Democratic Party’s vote share in the next election by approximately 7.41 percentage points, this result is statistically significant.

Without using “rdrobust”

library(dplyr)

# Create a smaller sample around the cutoff (e.g., within 5 margin points)
bandwidth <- 5
subset_data <- rdrobust_RDsenate %>%
  filter(margin >= -bandwidth & margin <= bandwidth)

# Create a dummy for being above the cutoff (margin >= 0)
subset_data$treated <- ifelse(subset_data$margin >= 0, 1, 0)

# Run a simple linear regression of outcome on treatment dummy
manual_model <- lm(vote ~ treated, data = subset_data)

# View the result
summary(manual_model)
## 
## Call:
## lm(formula = vote ~ treated, data = subset_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.985  -5.033   0.165   5.767  26.264 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44.9854     0.8124  55.374  < 2e-16 ***
## treated       7.7862     1.1756   6.623 2.24e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.191 on 243 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.1529, Adjusted R-squared:  0.1494 
## F-statistic: 43.87 on 1 and 243 DF,  p-value: 2.239e-10

(c) Is the running variable continuous at the threshold?

Plot a Histogram

# Plot histogram for running variable (margin)
hist(rdrobust_RDsenate$margin,
     breaks = 40,                        # number of bins
     main = "Histogram of Running Variable (Margin)",
     xlab = "Democratic Margin",
     col = "lightblue",
     border = "white")
abline(v = 0, col = "red", lwd = 2, lty = 2) # Vertical line at threshold

Discontinuity Test

library(rdd)
## Warning: package 'rdd' was built under R version 4.4.3
## Loading required package: AER
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:boot':
## 
##     logit
## The following object is masked from 'package:Jmisc':
## 
##     recode
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## The following object is masked from 'package:boot':
## 
##     aml
## Loading required package: Formula
# Run McCrary Density Test at threshold = 0
DCdensity(rdrobust_RDsenate$margin, cutpoint = 0)

## [1] 0.3897849

The result value is 0.38 which is greater than 0.05 which means that we cannot reject the null hypothesis. Similarly, in the graph, there is no jump around the threshold, suggesting no discontinuity.

Part ii. Conventional and Robust RDD (Fuzzy design)

# Load required libraries
library(RDHonest)
## Warning: package 'RDHonest' was built under R version 4.4.3
# Load the built-in data
data(rcp)

(a) Plots

Treatment - Running Variable

plot(rcp$elig_year, rcp$retired, 
     xlab = "Years from Pension Eligibility (elig_year)", 
     ylab = "Probability of Retirement (retired)", 
     main = "Treatment vs Running Variable", 
     pch = 20, col = "blue")

# vertical line at threshold = 0
abline(v = 0, lty = 2, col = "red")

In this graph, the jump is not sharp at the threshold. It looks like that there may be some retirements happening before the threshold as well. While you are eligible to retire at 0, but that doesn’t necessarily mean that people choose to get retired at that point exactly.

Outcome - Running Variable

plot(rcp$elig_year, rcp$cn, 
     xlab = "Years from Pension Eligibility (elig_year)", 
     ylab = "Household Consumption (cn)", 
     main = "Outcome vs Running Variable", 
     pch = 20, col = "darkgreen")

# vertical line at threshold = 0
abline(v = 0, lty = 2, col = "red")

This graph shows considerable variation in both pre and post threshold and the change in household consumption is not that obvious.

Both the graphs show that the change does not happen deterministically as required in sharp RDD, therefore, there is a need of fuzzy RDD where the discontinuity in the probability of treatment at the cutoff can be used as an instrument to estimate the causal effect of retirement on household consumption.

(b) Fuzzy RDD

library(RDHonest)
library(rdrobust)

fuzzy_result <- rdrobust(
  y = rcp$cn,
  x = rcp$elig_year,
  fuzzy = rcp$retired
)
## Warning in rdrobust(y = rcp$cn, x = rcp$elig_year, fuzzy = rcp$retired): Mass
## points detected in the running variable.
# Print the result
summary(fuzzy_result)
## Fuzzy RD estimates using local polynomial regression.
## 
## Number of Obs.                30006
## BW type                       mserd
## Kernel                   Triangular
## VCE method                       NN
## 
## Number of Obs.                16556        13450
## Eff. Number of Obs.            1599         2078
## Order est. (p)                    1            1
## Order bias  (q)                   2            2
## BW est. (h)                   4.950        4.950
## BW bias (b)                  15.002       15.002
## rho (h/b)                     0.330        0.330
## Unique Obs.                      39           49
## 
## First-stage estimates.
## 
## =============================================================================
##         Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
## =============================================================================
##   Conventional     0.313     0.039     7.936     0.000     [0.235 , 0.390]     
##         Robust         -         -     7.094     0.000     [0.212 , 0.374]     
## =============================================================================
## 
## Treatment effect estimates.
## 
## =============================================================================
##         Method     Coef. Std. Err.         z     P>|z|      [ 95% C.I. ]       
## =============================================================================
##   Conventional -5603.339  3072.345    -1.824     0.068[-11625.025 , 418.346]   
##         Robust         -         -    -1.837     0.066[-12222.336 , 396.082]   
## =============================================================================

The results show that in the first stage, eligibility strongly predicts retirement and in the second stage, it shows that retirement reduces the household consumption by 5603 euros per year.

(c) Conventional Fuzzy RDD

library(RDHonest)
library(dplyr)

# Define the running variable and create a treatment assignment
rcp <- rcp %>% 
  mutate(threshold = ifelse(elig_year >= 0, 1, 0))  # eligibility threshold indicator

# First Stage Regression (eligibility -> actual retirement)
first_stage <- lm(retired ~ threshold + elig_year, data = rcp)
summary(first_stage)
## 
## Call:
## lm(formula = retired ~ threshold + elig_year, data = rcp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.04483 -0.06584  0.02257  0.16413  0.99846 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.1542421  0.0036542   42.21   <2e-16 ***
## threshold   0.5289227  0.0064418   82.11   <2e-16 ***
## elig_year   0.0080369  0.0001831   43.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2852 on 30003 degrees of freedom
## Multiple R-squared:  0.6521, Adjusted R-squared:  0.6521 
## F-statistic: 2.812e+04 on 2 and 30003 DF,  p-value: < 2.2e-16
# Predicted retirement status
rcp$predicted_retired <- predict(first_stage)

# Second Stage Regression (predicted retirement -> consumption)
second_stage <- lm(cn ~ predicted_retired + elig_year, data = rcp)
summary(second_stage)
## 
## Call:
## lm(formula = cn ~ predicted_retired + elig_year, data = rcp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -18073  -5959  -2062   3499 311960 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       20449.039    186.401  109.70   <2e-16 ***
## predicted_retired -6536.304    420.760  -15.54   <2e-16 ***
## elig_year           122.177      9.388   13.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9852 on 30003 degrees of freedom
## Multiple R-squared:  0.008644,   Adjusted R-squared:  0.008578 
## F-statistic: 130.8 on 2 and 30003 DF,  p-value: < 2.2e-16
# This second_stage output gives you the CONVENTIONAL (non-robust) estimate!

Part iii. RDD and DID

(b) Short Description

Ruan et al., 2021 looks at COVID-19 pandemic and the resulting nationwide lockdowns in China affected wholesale vegetable prices. It investigates whether vegetable prices in wholesale markets increased, decreased, or remained stable during the lockdown period, and whether the impact varied across different types of vegetables. The research question they look at is if COVID-19 lockdowns significantly disrupt agricultural markets in China by affecting the supply, distribution, or pricing of vegetables. The authors study to provide causal evidence by using a regression discontinuity design (RDD) around the lockdown start date, exploiting the sudden and widespread implementation of lockdown policies.

(c) Data Description

The paper uses daily wholesale price data for vegetables from periods November 2019 to March 2020 collected from 151 wholesale markets across China for 70 different types of vegetables. Additional data was used on market characteristics and regional lockdown policies to strengthen their identification strategy. The high-frequency, market-level data enables a close examination of price movements immediately before and after the nationwide lockdowns, which came into effect in January 2020.

(d) RDD vs DID

The authors discuss that using RDD only could lead to issues in identification since the timing of Wuhan lockdown coincides with the Chinese new year which would inevitably affect the vegetable prices. Similarly, DID would also have its own limitations since due to rapid spread of the COVID-19 virus, there were virtually no untreated groups since the places were experiencing lockdown one after another.

Therefore, combining RDD and DID helps address both these limitations. RDD exploits the timing discontinuity in a narrow window while DID uses longer pre and post periods to control for broader time trends and seasonal effects. Together, they allow to credibly isolate the causal effect of the lockdowns on vegetable prices, minimizing bias from both seasonal shocks and nationwide time trends.

(e) Main Findings

The paper finds that the COVID-19 pandemic and the nationwide lockdowns led to a significant increase in wholesale vegetable prices in China. Using both RDD and DID, the authors show that vegetable prices rose sharply since the lockdowns (started on January 23, 2020). The increase in prices was particularly notable for leafy vegetables, which are more perishable and sensitive to supply chain disruptions.

The study also finds that the price increases were temporary, peaking shortly after the lockdowns and gradually declining as supply chains adapted to the new restrictions. These results suggest that while lockdowns caused short-term market disruptions and increased food prices, the strong adaptability of agricultural markets in China helped recover the prices relatively quickly. Broadly, the paper discusses importance of resilient supply chains to have stable markets during public health crises.

(f) Robustness Checks

The authors run several robustness checks to validate their results.

For DID, they test the parallel trends assumption by examining pre-lockdown price trends across different markets. They show that before the lockdown the price trajectories of vegetables were relatively stable, with no clear diverging trends. This supports the claim that, in the absence of the lockdown, price movements would have continued similarly across markets, satisfying the parallel trends assumption.

For RDD, authors ensure that the timing of the lockdown was exogenous to vegetable price trends. They perform placebo tests by applying the same RDD method at alternative cutoff dates before the actual lockdown date. These placebo tests show no significant price jumps on dates other than January 23rd 2020, suggesting that the observed discontinuity is indeed due to the lockdown and not driven by unrelated temporal shocks.