\[ E[\hat{\beta}_{R}] - \beta_{True} = E[(X'X+\lambda I)^{-1}X'Y] - \beta_{True} = (X'X+\lambda I)^{-1}X'X\beta_{True} - \beta_{True} = -\lambda(X'X+\lambda I)^{-1}\beta_{True} \neq 0 \text{ for } \lambda > 0 \]
# The bias is nonzero whenever lambda > 0, so the ridge estimator is biased: it shrinks the coefficients toward zero.
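# A minimal simulation sketch (simulated data and an arbitrary lambda, not part of the original
# answer) applying the closed-form ridge and OLS estimators: the ridge coefficients are pulled
# toward zero relative to the true values, illustrating the bias derived above.
set.seed(1)
n <- 1000; p <- 3; lambda <- 50
X <- matrix(rnorm(n * p), n, p)
beta_true <- c(2, -1, 0.5)
y <- X %*% beta_true + rnorm(n)
beta_ols <- solve(t(X) %*% X) %*% t(X) %*% y
beta_ridge <- solve(t(X) %*% X + lambda * diag(p)) %*% t(X) %*% y
cbind(true = beta_true, ols = as.numeric(beta_ols), ridge = as.numeric(beta_ridge))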
housingdata <- readRDS("testdata20250121.RDS")
housingdata$log_sale_amount <- log(housingdata$sale_amount)
housingdata$log_land_square_footage <- log(housingdata$land_square_footage)
train <- housingdata[housingdata$abbr != "CA" & housingdata$abbr != "CO" & housingdata$abbr != "NY", ]
train <- subset(train, select = c(log_sale_amount, year_built, bedrooms_all_buildings, number_of_bathrooms, number_of_fireplaces, stories_number, log_land_square_footage, AC_presence))
train_predictors <- subset(train, select = (-log_sale_amount))
test <- housingdata[housingdata$abbr == "CA" | housingdata$abbr == "CO" | housingdata$abbr == "NY", ]
test <- subset(test, select = c(log_sale_amount, year_built, bedrooms_all_buildings, number_of_bathrooms, number_of_fireplaces, stories_number, log_land_square_footage, AC_presence))
test_AC <- test[test$AC_presence == 1, ]
test_noAC <- test[test$AC_presence == 0, ]
test_predictors <- subset(test, select = (-log_sale_amount))
test_AC_predictors <- subset(test_AC, select = (-log_sale_amount))
test_noAC_predictors <- subset(test_noAC, select = (-log_sale_amount))
tuneGrid <- data.frame(
alpha = c(1),
lambda = c(seq(0, 0.2, length = 100), seq(0.2, 2, length = 100))
)
set.seed(16802)
model <- train(
log_sale_amount ~ .,
data = train,
method = "glmnet",
tuneGrid = tuneGrid,
trControl = trainControl(
method = "cv",
number = 5,
#repeats = 5,
verboseIter = FALSE
)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
train_parameters <- model$bestTune
train_parameters
## alpha lambda
## 2 1 0.002020202
# (c). The best lambda is 0.002.
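# A quick way to inspect the cross-validation profile over the lambda grid (a sketch, not run
# in the original):
plot(model)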
lasso_train <- glmnet(as.matrix(train_predictors), train$log_sale_amount, alpha = 1, lambda = train_parameters$lambda)
coef(lasso_train)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 5.322179322
## year_built 0.002794445
## bedrooms_all_buildings 0.025486767
## number_of_bathrooms 0.241166125
## number_of_fireplaces .
## stories_number 0.174046686
## log_land_square_footage 0.050173705
## AC_presence 0.023220885
exp(mean(train$log_sale_amount)) * 0.023220885
## [1] 5106.864
# (c). The MWTP for AC is $5,106.86 (the AC_presence coefficient times the geometric-mean sale price, exp(mean(log_sale_amount))).
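# The same MWTP calculation, pulling the AC_presence coefficient directly from the fitted LASSO
# rather than hardcoding the printed value (a sketch; it should reproduce the figure above).
ac_coef <- coef(lasso_train)["AC_presence", 1]
exp(mean(train$log_sale_amount)) * ac_coef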
test_AC_predict <- predict(lasso_train, as.matrix(test_AC_predictors))
test_noAC_predict <- predict(lasso_train, as.matrix(test_noAC_predictors))
exp(mean(test_AC_predict)) - exp(mean(test_noAC_predict))
## [1] 20925.86
# (d). The difference in predicted prices (with AC minus without AC) is $20,925.86.
exp(mean(test_AC$log_sale_amount)) - exp(mean(test_noAC$log_sale_amount))
## [1] -82377.15
# (e). The difference in actual prices (with AC minus without AC) is -$82,377.15: in CA, CO, and NY, homes without AC actually sell for more on average. The prediction in (d) is not close.
housingdata_CA <- housingdata[housingdata$abbr == "CA", ]
ols_CA <- lm(log_sale_amount ~ AC_presence + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage, data = housingdata_CA)
summary(ols_CA)$coef[[2]]
## [1] -0.3004015
exp(mean(housingdata_CA$log_sale_amount)) * summary(ols_CA)$coef[[2]]
## [1] -138097.2
housingdata_CO <- housingdata[housingdata$abbr == "CO", ]
ols_CO <- lm(log_sale_amount ~ AC_presence + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage, data = housingdata_CO)
summary(ols_CO)$coef[[2]]
## [1] 0.06026728
exp(mean(housingdata_CO$log_sale_amount)) * summary(ols_CO)$coef[[2]]
housingdata_NY <- housingdata[housingdata$abbr == "NY", ]
ols_NY <- lm(log_sale_amount ~ AC_presence + year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage, data = housingdata_NY)
summary(ols_NY)$coef[[2]]
## [1] 0.08946925
exp(mean(housingdata_NY$log_sale_amount)) * summary(ols_NY)$coef[[2]]
## [1] 45273.02
# (f). The state-by-state MWTP estimates are -$138,097.20 (CA), roughly +$21,800 (CO; the coefficient 0.0603 times the CO geometric-mean sale price of about $362,000), and $45,273.02 (NY). They are not similar at all.
# EXPLANATION: We have too few explanatory variables. LASSO is most useful when there are many candidate predictors and variable selection matters; with only 7 predictors, it adds little over OLS and makes less sense here.
# EXPLANATION: Yes, they are identical. By the Frisch-Waugh-Lovell (FWL) theorem, running the full regression is equivalent to regressing y on one set of Xs, regressing the remaining Xs on that same set, and then regressing the y-residuals on the X-residuals: the coefficients on the remaining Xs are the same as in the full regression. However, if X1 and X2 are highly correlated, the partialling-out step leaves little independent variation and the estimates become imprecise.
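# A minimal FWL sketch with simulated data (not part of the original answer): the coefficient on
# x1 from the full regression equals the coefficient from regressing the residuals of y (after
# partialling out x2) on the residuals of x1 (after partialling out x2).
set.seed(2)
x2 <- rnorm(200)
x1 <- 0.5 * x2 + rnorm(200)
y_sim <- 1 + 2 * x1 - x2 + rnorm(200)
full_fit <- lm(y_sim ~ x1 + x2)
partial_fit <- lm(resid(lm(y_sim ~ x2)) ~ resid(lm(x1 ~ x2)))
c(full = unname(coef(full_fit)["x1"]), fwl = unname(coef(partial_fit)[2]))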
# A corrected sketch of the call (not run): causalweight::treatDML takes numeric vectors and
# matrices rather than a formula with a data argument, so the columns have to be extracted from
# housingdata_CA explicitly.
#results <- treatDML(
#  y = housingdata_CA$log_sale_amount,
#  d = housingdata_CA$AC_presence,
#  x = as.matrix(housingdata_CA[, c("year_built", "bedrooms_all_buildings",
#                                   "number_of_bathrooms", "number_of_fireplaces",
#                                   "stories_number", "log_land_square_footage")]),
#  s = NULL,
#  dtreat = 1,
#  dcontrol = 0,
#  trim = 0.01,
#  MLmethod = "lasso",
#  k = 3,
#  normalized = TRUE
#)
# I could not get this approach to run as originally written; in particular, treatDML has no
# "data =" argument, so "data = housingdata_CA" is not allowed and the variables must be passed
# as vectors/matrices as above.
data(rdrobust_RDsenate, package = "rdrobust")
rdplot(y = rdrobust_RDsenate$vote, x = rdrobust_RDsenate$margin, c = 0)
sharp_rd <- rdrobust(
y = rdrobust_RDsenate$vote,
x = rdrobust_RDsenate$margin,
c = 0
)
summary(sharp_rd)
## Sharp RD estimates using local polynomial regression.
##
## Number of Obs. 1297
## BW type mserd
## Kernel Triangular
## VCE method NN
##
## Number of Obs. 595 702
## Eff. Number of Obs. 360 323
## Order est. (p) 1 1
## Order bias (q) 2 2
## BW est. (h) 17.754 17.754
## BW bias (b) 28.028 28.028
## rho (h/b) 0.633 0.633
## Unique Obs. 595 665
##
## =============================================================================
## Method Coef. Std. Err. z P>|z| [ 95% C.I. ]
## =============================================================================
## Conventional 7.414 1.459 5.083 0.000 [4.555 , 10.273]
## Robust - - 4.311 0.000 [4.094 , 10.919]
## =============================================================================
# INTERPRETATION: A party that barely wins a U.S. Senate election receives about 7.41 percentage points more of the vote in the following election for that seat than a party that barely loses (an incumbency advantage at the cutoff).
rdrobust_RDsenate$over_cutoff <- ifelse(rdrobust_RDsenate$margin >= 0, 1, 0)
rdrobust_RDsenate$margin_below_cutoff <- ifelse(rdrobust_RDsenate$margin >= 0, 0, rdrobust_RDsenate$margin)
rdrobust_RDsenate$margin_over_cutoff <- ifelse(rdrobust_RDsenate$margin >= 0, rdrobust_RDsenate$margin, 0)
sharp_rd <- lm(vote ~ over_cutoff + margin_below_cutoff + margin_over_cutoff, data = rdrobust_RDsenate)
summary(sharp_rd)
##
## Call:
## lm(formula = vote ~ over_cutoff + margin_below_cutoff + margin_over_cutoff,
## data = rdrobust_RDsenate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44.655 -6.279 0.524 7.329 43.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.90423 0.69921 64.221 < 2e-16 ***
## over_cutoff 6.04399 0.94225 6.414 1.98e-10 ***
## margin_below_cutoff 0.21630 0.02772 7.803 1.24e-14 ***
## margin_over_cutoff 0.38673 0.01502 25.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.65 on 1293 degrees of freedom
## (93 observations deleted due to missingness)
## Multiple R-squared: 0.5874, Adjusted R-squared: 0.5864
## F-statistic: 613.6 on 3 and 1293 DF, p-value: < 2.2e-16
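# The global linear fit above uses every observation, which is one reason its jump estimate
# (6.04) differs from the rdrobust local estimate (7.41). A sketch (not run in the original)
# restricting the same specification to the MSE-optimal bandwidth h = 17.754 reported above:
sharp_rd_local <- lm(vote ~ over_cutoff + margin_below_cutoff + margin_over_cutoff,
                     data = subset(rdrobust_RDsenate, abs(margin) < 17.754))
summary(sharp_rd_local)$coef["over_cutoff", ]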
hist(rdrobust_RDsenate$margin)
DCdensity(rdrobust_RDsenate$margin, 0)
## [1] 0.3897849
# INTERPRETATION: The histogram and the McCrary density test (p-value of about 0.39) show no significant discontinuity in the density of the running variable at the cutoff, so there is no evidence of sorting or manipulation around it.
data(rcp, package = "RDHonest")
rdplot(y = rcp$retired, x = rcp$elig_year, c = 0)
## [1] "Mass points detected in the running variable."
rdplot(y = rcp$cn, x = rcp$elig_year, c = 0)
## [1] "Mass points detected in the running variable."
fuzzy_rd <- rdrobust(y = rcp$cn, x = rcp$elig_year, c = 0)
## Warning in rdrobust(y = rcp$cn, x = rcp$elig_year, c = 0): Mass points detected
## in the running variable.
summary(fuzzy_rd)
## Sharp RD estimates using local polynomial regression.
##
## Number of Obs. 30006
## BW type mserd
## Kernel Triangular
## VCE method NN
##
## Number of Obs. 16556 13450
## Eff. Number of Obs. 4259 4854
## Order est. (p) 1 1
## Order bias (q) 2 2
## BW est. (h) 9.121 9.121
## BW bias (b) 17.002 17.002
## rho (h/b) 0.536 0.536
## Unique Obs. 39 49
##
## =============================================================================
## Method Coef. Std. Err. z P>|z| [ 95% C.I. ]
## =============================================================================
## Conventional -950.613 593.765 -1.601 0.109 [-2114.371 , 213.145]
## Robust - - -1.079 0.281 [-2116.897 , 613.538]
## =============================================================================
# INTERPRETATION: Crossing the pension-eligibility cutoff is associated with a drop in household consumption of about 950.61, but the estimate is not statistically significant (p = 0.109). Because no fuzzy argument was supplied, rdrobust reports the sharp (reduced-form) effect of eligibility here, not the effect of retirement itself.
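# A sketch (not run in the original) of the corresponding local-polynomial fuzzy RD: passing the
# retirement indicator through rdrobust's fuzzy argument rescales the reduced-form jump by the
# first-stage jump in retirement at the eligibility cutoff.
fuzzy_rd_lp <- rdrobust(y = rcp$cn, x = rcp$elig_year, c = 0, fuzzy = rcp$retired)
summary(fuzzy_rd_lp)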
rcp$over_cutoff <- ifelse(rcp$elig_year >= 0, 1, 0)
fuzzy_rd <- ivreg(cn ~ retired | over_cutoff, data = rcp)
summary(fuzzy_rd)
##
## Call:
## ivreg(formula = cn ~ retired | over_cutoff, data = rcp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17546 -5936 -2072 3505 313198
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18382.49 78.95 232.8 <2e-16 ***
## retiredTRUE -1756.20 147.63 -11.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9812 on 30004 degrees of freedom
## Multiple R-Squared: 0.01657, Adjusted R-squared: 0.01653
## Wald test: 141.5 on 1 and 30004 DF, p-value: < 2.2e-16
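# The IV regression above instruments retirement with the eligibility dummy but ignores the
# running variable. A sketch (not run in the original; elig_below and elig_over are constructed
# here for illustration) that also controls for elig_year on each side of the cutoff, mirroring
# the parametric sharp RD specification used for the Senate data:
rcp$elig_below <- ifelse(rcp$elig_year >= 0, 0, rcp$elig_year)
rcp$elig_over <- ifelse(rcp$elig_year >= 0, rcp$elig_year, 0)
fuzzy_rd_2sls <- ivreg(cn ~ retired + elig_below + elig_over | over_cutoff + elig_below + elig_over, data = rcp)
summary(fuzzy_rd_2sls)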
# CITATION: Ruan, Jianqing, Qingwen Cai, and Songqing Jin. "Impact of COVID-19 and nationwide lockdowns on vegetable prices: evidence from wholesale markets in China." American Journal of Agricultural Economics 103.5 (2021): 1574-1594. https://doi.org/10.1111/ajae.12211.
# DESCRIPTION: Research question: how much did the nationwide lockdown policy affect vegetable prices, and their fluctuation, in China during COVID-19?
# DESCRIPTION: The authors collected daily Chinese cabbage prices for 151 Chinese wholesale markets from December 1 to May 7 (2019-2020), scraped from the National Agricultural Products Price Database.
# ARGUMENT: A key limitation of the RD estimates is that they say little about how the effects of the nationwide lockdown on vegetable prices evolve over time or respond to the relaxation or lifting of the lockdown. With difference-in-differences alone, on the other hand, the full treated and control groups are not as comparable as the observations close to the cutoff.
# FINDING: The lockdown caused a large and immediate surge in the price and the price dispersion of Chinese cabbage, whereas both moved smoothly over the same period in normal years.
# The discontinuity-in-differences design serves as a robustness check: the authors test whether the difference in prices (relative to normal years) jumps discontinuously at the start of the lockdown.