rm(list = ls()) # clear the workspace (global environment)
options(scipen = 999) # avoid scientific notation in printed output
# 0. Set working directory
setwd("C:/Users/15000/Dropbox/Mohan_files/530/PS6")
SDID
#install.packages("Synth")
library(Synth)
## Warning: package 'Synth' was built under R version 4.4.3
## ##
## ## Synth Package: Implements Synthetic Control Methods.
## ## See https://web.stanford.edu/~jhain/synthpage.html for additional information.
#if (!require(devtools)) install.packages("devtools")
#devtools::install_github("synth-inference/synthdid")
library(synthdid)
data("california_prop99")
setup = panel.matrices(california_prop99) # builds Y (outcome matrix), N0 (number of control units), T0 (number of pre-treatment periods)
tau.hat = synthdid_estimate(setup$Y, setup$N0, setup$T0)
sprintf('point estimate: %1.2f', tau.hat)
## [1] "point estimate: -15.60"
# Estimated average treatment effect on the treated (ATT) of Proposition 99 on California's per-capita cigarette pack sales: -15.60
set.seed(530)
se = sqrt(vcov(tau.hat, method='placebo', replications=15)) # 15 placebo replications, matching the label below
sprintf('SDID Standard error with reps=15 and seed=530 : %1.2f', se)
## [1] "SDID Standard error with reps=15 and seed=530 : 9.90"
# "SDID Standard error with reps=15 and seed=530 : 9.90"
sprintf('95%% CI (%1.2f, %1.2f)', tau.hat - 1.96 * se, tau.hat + 1.96 * se)
## [1] "95% CI (-35.01, 3.80)"
# "95% CI (-35.01, 3.80)"
plot(tau.hat)
posUnitwgt = synthdid_controls(tau.hat) # control-unit weights from the SDID fit
synthdid_units_plot(tau.hat, units = rownames(posUnitwgt))
topUnitwgt18 = synthdid_controls(tau.hat)[1:18, , drop=FALSE]
synthdid_units_plot(tau.hat, units = rownames(topUnitwgt18))
DID
# 1. Estimate ATT using DID
tau.did <- did_estimate(setup$Y, setup$N0, setup$T0) # renamed so the did_estimate() function is not shadowed
sprintf("DID Point Estimate: %1.2f", tau.did)
## [1] "DID Point Estimate: -27.35"
#"DID Point Estimate: -27.35"
# 2. Standard Error using placebo
set.seed(530)
se_did <- sqrt(vcov(tau.did, method = 'placebo'))
sprintf("DID Standard Error: %1.2f", se_did)
## [1] "DID Standard Error: 16.39"
# "DID Standard Error: 16.39"
sprintf("DID 95%% CI: (%1.2f, %1.2f)", did_estimate - 1.96 * se_did, did_estimate + 1.96 * se_did)
## [1] "DID 95% CI: (-59.47, 4.77)"
# "DID 95% CI: (-59.47, 4.77)"
# 3. Plot trend
plot(tau.did)
# 4. Plot control unit weights (note: DID uses uniform weights, still useful to visualize)
did_unit_weights <- synthdid_controls(tau.did)
synthdid_units_plot(tau.did, units = rownames(did_unit_weights))
SC
# 1. Estimate ATT using SC
tau.sc <- sc_estimate(setup$Y, setup$N0, setup$T0) # renamed so the sc_estimate() function is not shadowed
sprintf("SC Point Estimate: %1.2f", tau.sc)
## [1] "SC Point Estimate: -19.62"
# "SC Point Estimate: -19.62"
# 2. Standard Error using placebo
set.seed(530)
se_sc <- sqrt(vcov(tau.sc, method = 'placebo'))
sprintf("SC Standard Error: %1.2f", se_sc)
## [1] "SC Standard Error: 11.47"
# "SC Standard Error: 11.47"
sprintf("SC 95%% CI: (%1.2f, %1.2f)", sc_estimate - 1.96 * se_sc, sc_estimate + 1.96 * se_sc)
## [1] "SC 95% CI: (-42.11, 2.87)"
# "SC 95% CI: (-42.11, 2.87)"
# 3. Plot trend
plot(tau.sc)
# 4. Plot control unit weights
sc_unit_weights <- synthdid_controls(tau.sc)
synthdid_units_plot(tau.sc, units = rownames(sc_unit_weights))
Put the output together
#Table 1
library(knitr)
table1 <- data.frame(
Method = c("SDID", "SC", "DID"),
Estimate = c(-15.60, -19.62, -27.35),
SE = c(9.90, 11.47, 16.39)
)
kable(table1, digits = 2, caption = "Table 1: Estimated ATT and SE using Placebo Method")
| Method | Estimate |    SE |
|:-------|---------:|------:|
| SDID   |   -15.60 |  9.90 |
| SC     |   -19.62 | 11.47 |
| DID    |   -27.35 | 16.39 |
library(ggplot2)
library(gridExtra)
trend_plot <- function(est_obj, method_label) {
plot(est_obj) +
ggtitle(method_label) +
theme(plot.title = element_text(hjust = 0.5))
}
weight_plot <- function(est_obj) {
units <- rownames(setup$Y)[1:setup$N0]
synthdid_units_plot(est_obj, units = units)
}
trend_did <- trend_plot(tau.did, "DID")
trend_sc <- trend_plot(tau.sc, "SC")
trend_sdid <- trend_plot(tau.hat, "SDID")
weight_did <- weight_plot(tau.did)
weight_sc <- weight_plot(tau.sc)
weight_sdid <- weight_plot(tau.hat)
grid.arrange(trend_did, trend_sc, trend_sdid,
weight_did, weight_sc, weight_sdid,
ncol = 3, nrow = 2)
The DID estimate is the largest in magnitude, but it assumes common trends across all states and weights every control unit equally. As a result, it may overstate the effect by failing to match California's pre-treatment trend closely.
The SC method improves the pre-treatment fit by optimizing unit weights, relying heavily on a few states (e.g., Utah and Montana) with similar pre-treatment behavior. However, it does not include time fixed effects, which can bias post-treatment estimates when unobserved shocks vary over time.
The SDID method combines the strengths of DID and SC by reweighting both units and time periods. It provides a better pre-treatment fit while accounting for time-varying shocks. The bottom SDID plot shows a more balanced distribution of control weights, suggesting a more robust estimate. Visually, the gap between California and its synthetic control remains pronounced post-treatment, with SDID capturing the effect more conservatively than DID but more flexibly than SC.
Overall, I consider SDID the best method here: it pairs SC's pre-treatment fit with DID's adjustment for common time shocks.
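A quick check of the weight claim (a sketch using the SC and SDID fits from above; this assumes, as the earlier [1:18, ] subset suggests, that synthdid_controls() returns unit weights sorted in decreasing order):
head(synthdid_controls(tau.sc), 5)  # SC: weight mass concentrated on a few states
head(synthdid_controls(tau.hat), 5) # SDID: weights spread more evenly across donors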
library(synthdid)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(knitr)
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.4.3
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
data("california_prop99")
restricted_data <- california_prop99 %>%
filter(State != "Nevada", State != "New Hampshire")
setup_restricted <- panel.matrices(restricted_data)
set.seed(530) # reproducibility for the placebo SEs in this section
# --- SDID
sdid_r <- synthdid_estimate(setup_restricted$Y, setup_restricted$N0, setup_restricted$T0)
se_sdid_r <- sqrt(vcov(sdid_r, method = "placebo"))
est_sdid_r <- as.numeric(sdid_r)
# --- SC
sc_r <- sc_estimate(setup_restricted$Y, setup_restricted$N0, setup_restricted$T0)
se_sc_r <- sqrt(vcov(sc_r, method = "placebo"))
est_sc_r <- as.numeric(sc_r)
# --- DID
did_r <- did_estimate(setup_restricted$Y, setup_restricted$N0, setup_restricted$T0)
se_did_r <- sqrt(vcov(did_r, method = "placebo"))
est_did_r <- as.numeric(did_r)
table2 <- data.frame(
Method = c("SDID", "SC", "DID"),
Estimate = c(est_sdid_r, est_sc_r, est_did_r),
SE = c(se_sdid_r, se_sc_r, se_did_r)
)
# Table 2
kable(table2, digits = 2, caption = "Table 2: ATT Estimates with Restricted Donor Pool (Nevada & NH removed)")
| Method | Estimate |    SE |
|:-------|---------:|------:|
| SDID   |   -19.45 |  8.90 |
| SC     |   -21.68 | 12.87 |
| DID    |   -30.10 | 12.07 |
trend_plot_r <- function(est_obj, method_label) {
plot(est_obj) + ggtitle(method_label) + theme(plot.title = element_text(hjust = 0.5))
}
unit_plot_r <- function(est_obj) {
units <- rownames(setup_restricted$Y)[1:setup_restricted$N0]
synthdid_units_plot(est_obj, units = units)
}
trend_sdid_r <- trend_plot_r(sdid_r, "SDID (Restricted)")
trend_sc_r <- trend_plot_r(sc_r, "SC (Restricted)")
trend_did_r <- trend_plot_r(did_r, "DID (Restricted)")
unit_sdid_r <- unit_plot_r(sdid_r)
unit_sc_r <- unit_plot_r(sc_r)
unit_did_r <- unit_plot_r(did_r)
(trend_did_r | trend_sc_r | trend_sdid_r) /
(unit_did_r | unit_sc_r | unit_sdid_r)
Table 2:
Removing Nevada and New Hampshire from the donor pool makes the estimated treatment effects somewhat larger in magnitude across all three methods.
The SDID estimate moves from −15.6 (SE = 9.9) to −19.5 (SE = 8.9), indicating a stronger estimated effect when these two states are excluded. The change is moderate, suggesting some influence of the removed states, but the qualitative conclusion of a reduction in cigarette use remains.
The SC estimate becomes more negative, from −19.6 (SE = 11.5) to −21.7 (SE = 12.9). The increase in the standard error suggests that removing Nevada and New Hampshire reduces the precision of the SC fit, plausibly because high-weight donors are lost (Figure 1 shows Nevada carrying a large weight).
The DID estimate drops further, from −27.3 (SE = 16.4) to −30.1 (SE = 12.1). The point estimate becomes more negative while the standard error decreases, but DID still yields the most extreme estimate of the three methods.
Figure 2:
The synthetic controls for SC and SDID adjust slightly, with different states taking on more weight (as seen in the bottom row). The broader distribution of weights under SDID (compared to SC) persists, reinforcing SDID's robustness even with a restricted donor pool.
library(synthdid)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(patchwork)
library(knitr)
data("california_prop99")
trimmed_data <- california_prop99 %>%
filter(Year > 1974)
setup_trimmed <- panel.matrices(trimmed_data)
set.seed(530) # reproducibility for the placebo SEs in this section
# --- SDID
sdid_t <- synthdid_estimate(setup_trimmed$Y, setup_trimmed$N0, setup_trimmed$T0)
se_sdid_t <- sqrt(vcov(sdid_t, method = "placebo"))
est_sdid_t <- as.numeric(sdid_t)
# --- SC
sc_t <- sc_estimate(setup_trimmed$Y, setup_trimmed$N0, setup_trimmed$T0)
se_sc_t <- sqrt(vcov(sc_t, method = "placebo"))
est_sc_t <- as.numeric(sc_t)
# --- DID
did_t <- did_estimate(setup_trimmed$Y, setup_trimmed$N0, setup_trimmed$T0)
se_did_t <- sqrt(vcov(did_t, method = "placebo"))
est_did_t <- as.numeric(did_t)
table3 <- data.frame(
Method = c("SDID", "SC", "DID"),
Estimate = c(est_sdid_t, est_sc_t, est_did_t),
SE = c(se_sdid_t, se_sc_t, se_did_t)
)
kable(table3, digits = 2, caption = "Table 3: ATT Estimates with Early Pre-treatment Years (1970–1974) Removed")
| Method | Estimate |    SE |
|:-------|---------:|------:|
| SDID   |   -17.91 | 10.09 |
| SC     |   -20.26 | 10.86 |
| DID    |   -23.70 | 15.91 |
trend_plot_t <- function(est, label) {
plot(est) + ggtitle(label) + theme(plot.title = element_text(hjust = 0.5))
}
unit_plot_t <- function(est) {
units <- rownames(setup_trimmed$Y)[1:setup_trimmed$N0]
synthdid_units_plot(est, units = units)
}
trend_sdid_t <- trend_plot_t(sdid_t, "SDID (1975–1988)")
trend_sc_t <- trend_plot_t(sc_t, "SC (1975–1988)")
trend_did_t <- trend_plot_t(did_t, "DID (1975–1988)")
unit_sdid_t <- unit_plot_t(sdid_t)
unit_sc_t <- unit_plot_t(sc_t)
unit_did_t <- unit_plot_t(did_t)
(trend_did_t | trend_sc_t | trend_sdid_t) /
(unit_did_t | unit_sc_t | unit_sdid_t)
Table 3
Removing the early pre-treatment years (1970–1974) leads to slightly more negative estimates for all methods, though the overall pattern remains consistent.
The SDID estimate changes from −15.6 (SE = 9.9) to −17.9 (SE = 10.1). The effect size increases slightly and the standard error grows, indicating more uncertainty, likely due to the reduced pre-treatment information.
The SC estimate shifts from −19.6 (SE = 11.5) to −20.3 (SE = 10.9). The result is stable, with only marginal changes in effect size and uncertainty, suggesting SC is relatively insensitive to the excluded early years.
The DID estimate moves from −27.3 (SE = 16.4) to −23.7 (SE = 15.9). The estimate becomes slightly less negative and the standard error decreases somewhat, perhaps because excluding the early years removes noisier pre-treatment variation in trends.
Figure 3
Compared to Figure 1, the unit weight plots in Figure 3 show that SC and SDID reallocate donor weights in response to shortened pre-treatment periods, with SC showing sharper changes. SDID maintains a more diffuse and stable weighting pattern, reinforcing its flexibility and robustness under time restrictions.
library(synthdid)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(knitr)
# 1. Load dataset
data("california_prop99")
# 2. Remove post-treatment years 1996–2000
trimmed_post_data <- california_prop99 %>%
filter(Year <= 1995)
# 3. Reconstruct panel.matrices
setup_trimmed_post <- panel.matrices(trimmed_post_data)
# 4. Estimate ATT and SE for SDID, SC, DID
set.seed(530) # reproducibility for the placebo SEs in this section
# --- SDID
sdid_p <- synthdid_estimate(setup_trimmed_post$Y, setup_trimmed_post$N0, setup_trimmed_post$T0)
se_sdid_p <- sqrt(vcov(sdid_p, method = "placebo"))
est_sdid_p <- as.numeric(sdid_p)
# --- SC
sc_p <- sc_estimate(setup_trimmed_post$Y, setup_trimmed_post$N0, setup_trimmed_post$T0)
se_sc_p <- sqrt(vcov(sc_p, method = "placebo"))
est_sc_p <- as.numeric(sc_p)
# --- DID
did_p <- did_estimate(setup_trimmed_post$Y, setup_trimmed_post$N0, setup_trimmed_post$T0)
se_did_p <- sqrt(vcov(did_p, method = "placebo"))
est_did_p <- as.numeric(did_p)
# Create summary table
table4 <- data.frame(
Method = c("SDID", "SC", "DID"),
Estimate = c(est_sdid_p, est_sc_p, est_did_p),
SE = c(se_sdid_p, se_sc_p, se_did_p)
)
# Print Table 4
kable(table4, digits = 2, caption = "Table 4: ATT Estimates with Post-treatment Years (1996–2000) Removed")
| Method | Estimate |    SE |
|:-------|---------:|------:|
| SDID   |    -9.81 |  9.27 |
| SC     |   -15.28 |  7.78 |
| DID    |   -22.25 | 18.15 |
# Plot functions
trend_plot_p <- function(est, label) {
plot(est) + ggtitle(label) + theme(plot.title = element_text(hjust = 0.5))
}
unit_plot_p <- function(est) {
units <- rownames(setup_trimmed_post$Y)[1:setup_trimmed_post$N0]
synthdid_units_plot(est, units = units)
}
# Generate plots
trend_sdid_p <- trend_plot_p(sdid_p, "SDID (Post 1996–2000 Removed)")
trend_sc_p <- trend_plot_p(sc_p, "SC (Post 1996–2000 Removed)")
trend_did_p <- trend_plot_p(did_p, "DID (Post 1996–2000 Removed)")
unit_sdid_p <- unit_plot_p(sdid_p)
unit_sc_p <- unit_plot_p(sc_p)
unit_did_p <- unit_plot_p(did_p)
# Combine into Figure 4
(trend_did_p | trend_sc_p | trend_sdid_p) /
(unit_did_p | unit_sc_p | unit_sdid_p)
Table 4
The SDID estimate decreases in magnitude from −15.6 to −9.8, indicating a weaker average policy effect when only the earlier post-treatment years (1989–1995) are considered. Similarly, the SC estimate drops from −19.6 to −15.3, and the DID estimate becomes less extreme, moving from −27.3 to −22.3. These changes suggest that a sizable portion of the smoking reduction attributed to Proposition 99 occurred in the later post-treatment years that were excluded.
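One way to probe this claim directly (a sketch using the full-sample SDID fit tau.hat from above; per the package docs, synthdid_effect_curve() reports the estimated effect separately for each post-treatment year):
effect_by_year <- synthdid_effect_curve(tau.hat)
round(effect_by_year, 2) # one entry per post-treatment year (1989-2000); later years should be most negative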
Figure 4
Compared to Figure 1, the y-axis range in Figure 4’s unit weight plots is noticeably larger. This means that the differences in estimated outcomes across control states during the post-treatment period are more dispersed when only 1989–1995 is considered. This broader vertical spread reflects greater variation in post-treatment outcomes among the control units, likely because the shorter time window increases the relative influence of short-term noise or year-specific shocks. In Figure 1, the longer post-treatment period helps smooth these differences over time, compressing the range of estimated unit-level effects. In SC and SDID, the reweighting helps mitigate some of this volatility, but the effect estimates are still impacted by the narrower post-treatment window.
We examine the expectation of the ridge estimator:
\[ \mathbb{E}[\beta_R] = \mathbb{E}[(X'X + \lambda I)^{-1}X'(X\beta + U)] = (X'X + \lambda I)^{-1}X'X\beta + \underbrace{(X'X + \lambda I)^{-1}X'\mathbb{E}[U]}_{=0} \]
So we have:
\[ \mathbb{E}[\beta_R] = (X'X + \lambda I)^{-1}X'X\beta \]
Subtracting \(\beta\) makes the bias explicit:
\[ \mathbb{E}[\beta_R] - \beta = \left((X'X + \lambda I)^{-1}X'X - I\right)\beta = -\lambda (X'X + \lambda I)^{-1}\beta, \]
which vanishes only when \(\lambda = 0\) or \(\beta = 0\). Since we set \(\lambda > 0\), the ridge estimator is unbiased only in the degenerate case \(\beta = 0\); otherwise it is biased, shrunk toward zero.
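A minimal simulation sketch (toy data, assumed purely for illustration) confirming the formula: the Monte Carlo mean of the ridge estimator matches \((X'X + \lambda I)^{-1}X'X\beta\) rather than \(\beta\).
# Toy verification of E[beta_R] = (X'X + lambda I)^{-1} X'X beta (assumed setup)
set.seed(1)
n <- 200; p <- 3
X <- matrix(rnorm(n * p), n, p) # fixed design
beta <- c(2, -1, 0.5)
lambda <- 5
XtX <- crossprod(X)
ridge_draws <- replicate(2000, {
  y <- X %*% beta + rnorm(n) # U ~ N(0, I), so E[U] = 0
  as.vector(solve(XtX + lambda * diag(p), crossprod(X, y)))
})
rowMeans(ridge_draws) # Monte Carlo mean of the ridge estimator
as.vector(solve(XtX + lambda * diag(p), XtX %*% beta)) # theoretical E[beta_R]: shrunk toward 0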
library(pacman)
p_load(tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
# Load housing data
housingdata <- readRDS("C:/Users/15000/Dropbox/Mohan_files/temp_problem_set_final/testdata20250121.RDS")
housingdata$year = year(ymd(housingdata$sale_date)) # Extract year after formatting the date
housingdata$month = month(ymd(housingdata$sale_date)) # Extract month after formatting the date
housingdata$day = day(ymd(housingdata$sale_date)) # Extract day after formatting the date
housingdata$year_built_grp <- NA # placeholder
housingdata$year_built_grp[housingdata$year_built <= 1986] <- 1 # below or equal to median - very old houses
housingdata$year_built_grp[housingdata$year_built > 1986 & housingdata$year_built <= 2001] <- 2 # above median up to third quartile - quite old houses
housingdata$year_built_grp[housingdata$year_built > 2001 & housingdata$year_built <= 2010] <- 3 # above third quartile up to 2010- quite new houses
housingdata$year_built_grp[housingdata$year_built > 2010 & housingdata$year_built <= 2019] <- 4 # last decade - very new houses
housingdata$log_sale_amount <- log(housingdata$sale_amount)
housingdata$log_land_square_footage <- log(housingdata$land_square_footage)
# cleanhousingdata <- subset(housingdata, housingdata$land_square_footage<999999999
# & housingdata$bedrooms_all_buildings<999)
cleanhousingdata <- subset(housingdata, housingdata$land_square_footage<999999999
& housingdata$bedrooms_all_buildings<999
& housingdata$number_of_fireplaces<999)
states_test = c("NY","CO","CA")
states_test2 = c("CA")
cols_sel = c('sale_amount','log_sale_amount','year_built', 'bedrooms_all_buildings', 'number_of_bathrooms', 'number_of_fireplaces', 'stories_number', 'log_land_square_footage', 'AC_presence', 'abbr')
data_train = cleanhousingdata[!(cleanhousingdata$abbr %in% states_test), ..cols_sel]
data_test = cleanhousingdata[(cleanhousingdata$abbr %in% states_test), ..cols_sel]
library(data.table)
library(glmnet)
## Warning: package 'glmnet' was built under R version 4.4.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
# Example setup
y_column <- "log_sale_amount"
# Prepare x and y matrices
x_train <- as.matrix(data_train[, c(
'year_built',
'bedrooms_all_buildings',
'number_of_bathrooms',
'number_of_fireplaces',
'stories_number',
'log_land_square_footage',
'AC_presence'
)])
y_train <- data_train[[y_column]]
set.seed(123) # For reproducibility
cv_model <- cv.glmnet(
x_train, y_train,
alpha = 1, # alpha=1 for Lasso regression
nfolds = 5, # 5-fold cross-validation
type.measure = "mse" # evaluate by MSE
)
# Display best lambda (penalty)
best_lambda <- cv_model$lambda.min
print(best_lambda)
## [1] 0.001218253
lasso_best <- glmnet(x_train, y_train, alpha = 1, lambda = best_lambda)
# Extract and print coefficients
lasso_coef <- coef(lasso_best)
# Convert to a more readable format:
lasso_coef_df <- data.frame(
variable = rownames(lasso_coef),
coefficient = as.vector(lasso_coef)
)
print(lasso_coef_df)
## variable coefficient
## 1 (Intercept) 5.367335837
## 2 year_built 0.002731763
## 3 bedrooms_all_buildings 0.073452212
## 4 number_of_bathrooms 0.224953594
## 5 number_of_fireplaces 0.000000000
## 6 stories_number 0.159976449
## 7 log_land_square_footage 0.047754428
## 8 AC_presence 0.018528133
The optimal lambda is 0.001218253.
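For reference, cv.glmnet also stores the more conservative one-standard-error choice, which typically yields a sparser, more regularized model:
cv_model$lambda.1se # largest lambda whose CV error is within one SE of the minimum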
In the LASSO regression, the coefficient on AC_presence is 0.0185.
This means that having air conditioning is associated with a 0.0185 increase in log sale price, i.e., roughly a 1.9% higher price (exp(0.0185) − 1 ≈ 0.0187), holding the other variables constant.
The average sale amount in the training data:
mean(data_train$sale_amount)
## [1] 304780.1
304780.1*(exp(0.0185)-1)
## [1] 5690.91
Evaluated at this average sale amount, the implied MWTP for AC is about $5,691.
# Step 1: Prepare test dataset X matrix (same variables as training)
X_test <- as.matrix(data_test[, c(
'year_built',
'bedrooms_all_buildings',
'number_of_bathrooms',
'number_of_fireplaces',
'stories_number',
'log_land_square_footage',
'AC_presence'
)])
# Step 2: Get average housing attributes in test dataset, separately for homes WITH and WITHOUT AC
test_avg_AC <- colMeans(X_test[data_test$AC_presence == 1, ])
test_avg_noAC <- colMeans(X_test[data_test$AC_presence == 0, ])
# Step 3: Prepare coefficient vector (from your LASSO model)
coef_vec <- c(
`(Intercept)` = 5.367335837,
year_built = 0.002731763,
bedrooms_all_buildings = 0.073452212,
number_of_bathrooms = 0.224953594,
number_of_fireplaces = 0.000000000,
stories_number = 0.159976449,
log_land_square_footage = 0.047754428,
AC_presence = 0.018528133
)
# Step 4: Predict average log sale amount for homes WITH AC
pred_logprice_AC <- coef_vec["(Intercept)"] + sum(coef_vec[-1] * test_avg_AC)
# Step 5: Predict average log sale amount for homes WITHOUT AC
pred_logprice_noAC <- coef_vec["(Intercept)"] + sum(coef_vec[-1] * test_avg_noAC)
# Step 6: Convert predicted log price to actual price
pred_price_AC <- exp(pred_logprice_AC)
pred_price_noAC <- exp(pred_logprice_noAC)
# Step 7: Calculate difference
price_difference <- pred_price_AC - pred_price_noAC
# Print results
cat("Predicted average house price with AC:", round(pred_price_AC, 2), "\n")
## Predicted average house price with AC: 214883.4
cat("Predicted average house price without AC:", round(pred_price_noAC, 2), "\n")
## Predicted average house price without AC: 194181.9
cat("Difference in predicted average house price (AC - no AC):", round(price_difference, 2), "\n")
## Difference in predicted average house price (AC - no AC): 20701.46
The predicted difference in average price is $20,701.46.
# Step 1: Calculate average sale amount separately for homes with AC and without AC
avg_sale_with_AC <- mean(data_test$sale_amount[data_test$AC_presence == 1], na.rm = TRUE)
avg_sale_without_AC <- mean(data_test$sale_amount[data_test$AC_presence == 0], na.rm = TRUE)
# Step 2: Calculate the observed difference
observed_difference <- avg_sale_with_AC - avg_sale_without_AC
# Step 3: Print the results
cat("Observed average sale amount with AC:", round(avg_sale_with_AC, 2), "\n")
## Observed average sale amount with AC: 572608.3
cat("Observed average sale amount without AC:", round(avg_sale_without_AC, 2), "\n")
## Observed average sale amount without AC: 661440.6
cat("Difference in observed average sale amount (with AC - without AC):", round(observed_difference, 2), "\n")
## Difference in observed average sale amount (with AC - without AC): -88832.29
The observed difference is −$88,832.29.
The observed and predicted differences are not close. The model predicts that homes with AC should sell for about $20,701 more than homes without AC, but in the test data homes with AC actually sell for $88,832 less on average. Not only is the magnitude much larger, the sign is opposite, indicating that the model fit on the training states does not reproduce the pattern observed in the held-out test states.
library(dplyr)
library(broom)
# Create an empty list to save models
state_models <- list()
# Create a vector for the states you want
states_of_interest <- c("NY", "CO", "CA")
# Loop over each state
for (state in states_of_interest) {
data_state <- data_test %>% filter(abbr == state)
# Run OLS regression: y = log_sale_amount ~ all other x's
model <- lm(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms +
number_of_fireplaces + stories_number + log_land_square_footage + AC_presence,
data = data_state)
state_models[[state]] <- model
}
# Extract AC_presence coefficient for each state
ac_coefficients <- sapply(state_models, function(model) {
coef(model)["AC_presence"]
})
# Show the results
ac_coefficients
## NY.AC_presence CO.AC_presence CA.AC_presence
## 0.08946925 0.06535149 -0.30103749
MWTP:
library(dplyr)
data_test %>%
filter(abbr %in% c("NY", "CO", "CA")) %>%
group_by(abbr) %>%
summarize(mean_sale_amount = mean(sale_amount, na.rm = TRUE))
## # A tibble: 3 × 2
## abbr mean_sale_amount
## <chr> <dbl>
## 1 CA 665301.
## 2 CO 487631.
## 3 NY 554139.
#CA
mean_sale_amount <- c(CA = 665301.1, CO = 487630.5, NY = 554138.7)
coefficient <- c(CA = -0.30103749, CO = 0.06535149, NY = 0.08946925)
# Calculate
result <- mean_sale_amount * (exp(coefficient) - 1)
# Show results
result
## CA CO NY
## -172945.00 32931.73 51863.89
These are the implied MWTP values for CA, CO, and NY.
The LASSO-based MWTP for AC from the training data is $5,690.91, while the state-specific OLS-based MWTPs in the test data are −$172,945 for California, $32,932 for Colorado, and $51,864 for New York. The estimates are not similar: the CO and NY values are several times larger than the LASSO estimate, and the CA value even has the opposite sign. Part of the gap reflects LASSO's shrinkage of coefficients toward zero to avoid overfitting, and part reflects heterogeneity across states that a single pooled coefficient cannot capture.
These claims are false in general; I show this with a counterexample.
Without loss of generality, I provide a counterexample only for Alternative 1.
Let \(Y\), \(X_1\), \(X_2\) be column vectors of the same length. Consider the situation
\[ Y = \beta_1 X_1 + \beta_2 X_2, \quad X_1 \neq 0,\; X_2 \neq 0,\; \beta_1 \neq 0,\; \beta_2 \neq 0, \quad \cos\angle(X_1, X_2) \in (0,1), \]
i.e., \(X_1\) and \(X_2\) are neither parallel nor orthogonal.
We know that
\[ Y = \bar{\beta}_1' X_1 + \bar{\beta}_2' \bar{X}_2, \qquad \bar{X}_2 = X_2 - \frac{X_1^\top X_2}{\|X_1\|_2^2} X_1, \]
where
\[ \bar{\beta}_2' = \beta_2, \qquad \bar{\beta}_1' = \beta_1 + \beta_2 \frac{X_1^\top X_2}{\|X_1\|_2^2}. \]
(Check: \( \bar{\beta}_1' X_1 + \bar{\beta}_2' \bar{X}_2 = \beta_1 X_1 + \beta_2 \tfrac{X_1^\top X_2}{\|X_1\|_2^2} X_1 + \beta_2 X_2 - \beta_2 \tfrac{X_1^\top X_2}{\|X_1\|_2^2} X_1 = \beta_1 X_1 + \beta_2 X_2 = Y \).)
Since \(X_1 \perp \bar{X}_2\), regressing \(Y\) on \(X_1\) alone (Equation (2)) yields the coefficient \(\bar{\beta}_1'\). Because \(\bar{\beta}_1' = \beta_1 + \beta_2 \frac{X_1^\top X_2}{\|X_1\|_2^2}\), we have
\[ \bar{\beta}_1' = \beta_1 \iff \beta_2 \frac{X_1^\top X_2}{\|X_1\|_2^2} = 0, \]
which contradicts our assumptions (\(\beta_2 \neq 0\) and \(X_1^\top X_2 \neq 0\)).
Remark: when \(X_1 \perp X_2\) (i.e., \(X_1^\top X_2 = 0\)), the claim does hold.
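A quick numeric illustration of the counterexample (toy data, assumed purely for illustration):
# Toy check: with correlated regressors and no noise, the short regression of
# y on x1 recovers beta1 + beta2 * (x1'x2 / ||x1||^2), not beta1.
set.seed(2)
n <- 100
x1 <- rnorm(n)
x2 <- 0.6 * x1 + rnorm(n) # correlated with x1: neither parallel nor orthogonal
y <- 1 * x1 + 2 * x2 # beta1 = 1, beta2 = 2
coef(lm(y ~ x1 - 1)) # short-regression coefficient: differs from beta1 = 1
1 + 2 * sum(x1 * x2) / sum(x1^2) # beta1 + beta2 * x1'x2 / ||x1||^2: identical value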
library(causalweight)
## Warning: package 'causalweight' was built under R version 4.4.3
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 4.4.3
data_test2 = cleanhousingdata[(cleanhousingdata$abbr %in% states_test2), ..cols_sel]
x_cols = c('year_built', 'bedrooms_all_buildings', 'number_of_bathrooms', 'number_of_fireplaces', 'stories_number', 'log_land_square_footage')
D_cols = 'AC_presence'
Y_cols = 'log_sale_amount'
X=as.matrix(data_test2[,..x_cols])
D=data_test2[[D_cols]]
Y=data_test2[[Y_cols]]
output = treatDML(y = Y, d = D, x = X, s = NULL, dtreat = 1, dcontrol = 0,
                  trim = 0.01, MLmethod = "lasso", normalized = FALSE, k = 3)
## Loading required package: nnls
output$effect; output$se; output$pval; output$meantreat; output$meancontrol
## [1] -0.2949774
## [1] 0.001897359
## [1] 0
## [1] 12.91642
## [1] 13.2114
There are several ways the results could be improved. First, increasing the number of folds in the cross-fitting procedure (e.g., from 3 to 5 or 10) could stabilize the estimates and reduce variance. Second, alternative machine learning methods, such as random forests or boosting, might capture more complex nonlinear relationships between the covariates and the outcome or treatment assignment. Third, including additional relevant covariates, or transforming the existing ones (e.g., interaction terms or polynomial features), could better control for confounding. Finally, a larger sample or more careful trimming could improve precision and robustness. Two of these tweaks are sketched below.
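A sketch of the first two suggestions (not run above; this assumes, per the causalweight documentation, that MLmethod = "randomforest" is supported via the ranger package loaded with causalweight, and the results will differ from the lasso run):
set.seed(530)
output_rf = treatDML(y = Y, d = D, x = X, s = NULL, dtreat = 1, dcontrol = 0,
                     trim = 0.01, MLmethod = "randomforest",
                     normalized = FALSE, k = 5) # 5-fold cross-fitting, random-forest nuisance models
output_rf$effect; output_rf$se; output_rf$pval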