Nhập dữ liệu Ameshousing và chạy phân tích
library(AmesHousing)
data("ames_raw")
head(ames_raw)
## # A tibble: 6 × 82
## Order PID `MS SubClass` `MS Zoning` `Lot Frontage` `Lot Area` Street Alley
## <int> <chr> <chr> <chr> <int> <int> <chr> <chr>
## 1 1 052630… 020 RL 141 31770 Pave <NA>
## 2 2 052635… 020 RH 80 11622 Pave <NA>
## 3 3 052635… 020 RL 81 14267 Pave <NA>
## 4 4 052635… 020 RL 93 11160 Pave <NA>
## 5 5 052710… 060 RL 74 13830 Pave <NA>
## 6 6 052710… 060 RL 78 9978 Pave <NA>
## # ℹ 74 more variables: `Lot Shape` <chr>, `Land Contour` <chr>,
## # Utilities <chr>, `Lot Config` <chr>, `Land Slope` <chr>,
## # Neighborhood <chr>, `Condition 1` <chr>, `Condition 2` <chr>,
## # `Bldg Type` <chr>, `House Style` <chr>, `Overall Qual` <int>,
## # `Overall Cond` <int>, `Year Built` <int>, `Year Remod/Add` <int>,
## # `Roof Style` <chr>, `Roof Matl` <chr>, `Exterior 1st` <chr>,
## # `Exterior 2nd` <chr>, `Mas Vnr Type` <chr>, `Mas Vnr Area` <int>, …
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MatchIt)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(EValue)
library(miceadds)
## * miceadds 3.18-36 (2025-09-12 09:54:54)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MatchThem)
##
## Attaching package: 'MatchThem'
##
## The following objects are masked from 'package:mice':
##
## cbind, pool
##
## The following object is masked from 'package:base':
##
## cbind
library(cobalt)
## cobalt (Version 4.6.1, Build Date: 2025-08-20)
##
## Attaching package: 'cobalt'
##
## The following object is masked from 'package:MatchIt':
##
## lalonde
library(broom)
attach(ames_raw)
Chọn biến liên quan đưa vào phân tích
data_for_mi <- ames_raw[, c("SalePrice", "Garage Cars", "Lot Area", "Overall Qual", "Year Built", "Total Bsmt SF", "Full Bath", "Lot Frontage")]
data_for_mi$Garage_Treated <- as.numeric(data_for_mi$`Garage Cars` > 0)
data_for_mi <- data_for_mi %>%
select(-`Garage Cars`)
head(data_for_mi)
## # A tibble: 6 × 8
## SalePrice `Lot Area` `Overall Qual` `Year Built` `Total Bsmt SF` `Full Bath`
## <int> <int> <int> <int> <int> <int>
## 1 215000 31770 6 1960 1080 1
## 2 105000 11622 5 1961 882 1
## 3 172000 14267 6 1958 1329 1
## 4 244000 11160 7 1968 2110 2
## 5 189900 13830 5 1997 928 2
## 6 195500 9978 6 1998 926 2
## # ℹ 2 more variables: `Lot Frontage` <int>, Garage_Treated <dbl>
summary(data_for_mi)
## SalePrice Lot Area Overall Qual Year Built
## Min. : 12789 Min. : 1300 Min. : 1.000 Min. :1872
## 1st Qu.:129500 1st Qu.: 7440 1st Qu.: 5.000 1st Qu.:1954
## Median :160000 Median : 9436 Median : 6.000 Median :1973
## Mean :180796 Mean : 10148 Mean : 6.095 Mean :1971
## 3rd Qu.:213500 3rd Qu.: 11555 3rd Qu.: 7.000 3rd Qu.:2001
## Max. :755000 Max. :215245 Max. :10.000 Max. :2010
##
## Total Bsmt SF Full Bath Lot Frontage Garage_Treated
## Min. : 0 Min. :0.000 Min. : 21.00 Min. :0.0000
## 1st Qu.: 793 1st Qu.:1.000 1st Qu.: 58.00 1st Qu.:1.0000
## Median : 990 Median :2.000 Median : 68.00 Median :1.0000
## Mean :1052 Mean :1.567 Mean : 69.22 Mean :0.9464
## 3rd Qu.:1302 3rd Qu.:2.000 3rd Qu.: 80.00 3rd Qu.:1.0000
## Max. :6110 Max. :4.000 Max. :313.00 Max. :1.0000
## NA's :1 NA's :490 NA's :1
ps_formula <- Garage_Treated ~ `Lot Area`+ `Overall Qual`+ `Year Built`+ `Total Bsmt SF` + `Full Bath` + `Lot Frontage`
Thực hiện mutiple imputation và chạy PSA bằng hàm “matchthem” trong
package “MatchThem” (nhằm trách lỗi)
imputed_data <- mice(data_for_mi, m=5, seed = 123, method = "pmm")
##
## iter imp variable
## 1 1 Total Bsmt SF Lot Frontage Garage_Treated
## 1 2 Total Bsmt SF Lot Frontage Garage_Treated
## 1 3 Total Bsmt SF Lot Frontage Garage_Treated
## 1 4 Total Bsmt SF Lot Frontage Garage_Treated
## 1 5 Total Bsmt SF Lot Frontage Garage_Treated
## 2 1 Total Bsmt SF Lot Frontage Garage_Treated
## 2 2 Total Bsmt SF Lot Frontage Garage_Treated
## 2 3 Total Bsmt SF Lot Frontage Garage_Treated
## 2 4 Total Bsmt SF Lot Frontage Garage_Treated
## 2 5 Total Bsmt SF Lot Frontage Garage_Treated
## 3 1 Total Bsmt SF Lot Frontage Garage_Treated
## 3 2 Total Bsmt SF Lot Frontage Garage_Treated
## 3 3 Total Bsmt SF Lot Frontage Garage_Treated
## 3 4 Total Bsmt SF Lot Frontage Garage_Treated
## 3 5 Total Bsmt SF Lot Frontage Garage_Treated
## 4 1 Total Bsmt SF Lot Frontage Garage_Treated
## 4 2 Total Bsmt SF Lot Frontage Garage_Treated
## 4 3 Total Bsmt SF Lot Frontage Garage_Treated
## 4 4 Total Bsmt SF Lot Frontage Garage_Treated
## 4 5 Total Bsmt SF Lot Frontage Garage_Treated
## 5 1 Total Bsmt SF Lot Frontage Garage_Treated
## 5 2 Total Bsmt SF Lot Frontage Garage_Treated
## 5 3 Total Bsmt SF Lot Frontage Garage_Treated
## 5 4 Total Bsmt SF Lot Frontage Garage_Treated
## 5 5 Total Bsmt SF Lot Frontage Garage_Treated
matched_imputations <- matchthem(ps_formula,
datasets = imputed_data,
approach = "within",
method = "full",
distance = "glm")
##
## Matching Observations | dataset: #1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #3
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #4
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #5
summary(matched_imputations)
## Summarizing | dataset: #1
##
## Call:
## matchthem(formula = ps_formula, datasets = imputed_data, approach = "within",
## method = "full", distance = "glm")
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.9536 0.8188 1.8651 0.2314 0.3458
## Lot Area 10285.6924 7714.5605 0.3198 6.0302 0.1552
## Overall Qual 6.1785 4.6178 1.1303 1.6070 0.1561
## Year Built 1972.8868 1944.3248 0.9695 0.8893 0.2312
## Total Bsmt SF 1066.6628 783.5159 0.6464 1.2092 0.1658
## Full Bath 1.5792 1.3439 0.4268 1.0430 0.0480
## Lot Frontage 70.5957 59.7707 0.4570 1.3241 0.0815
## eCDF Max
## distance 0.5882
## Lot Area 0.2740
## Overall Qual 0.4821
## Year Built 0.3873
## Total Bsmt SF 0.2947
## Full Bath 0.2338
## Lot Frontage 0.3155
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.9536 0.9529 0.0101 0.9416 0.0277
## Lot Area 10285.6924 9470.0631 0.1014 4.9649 0.0703
## Overall Qual 6.1785 5.7912 0.2805 1.7831 0.0420
## Year Built 1972.8868 1970.9598 0.0654 0.6013 0.0622
## Total Bsmt SF 1066.6628 974.3260 0.2108 1.8465 0.0683
## Full Bath 1.5792 1.1662 0.7493 1.8915 0.0839
## Lot Frontage 70.5957 66.4769 0.1739 2.5369 0.0604
## eCDF Max Std. Pair Dist.
## distance 0.2113 0.0204
## Lot Area 0.1787 0.5896
## Overall Qual 0.1551 0.6488
## Year Built 0.2074 0.8067
## Total Bsmt SF 0.2470 0.8709
## Full Bath 0.3974 1.1731
## Lot Frontage 0.2973 0.8694
##
## Sample Sizes:
## Control Treated
## All 157. 2773
## Matched (ESS) 17.89 2773
## Matched 157. 2773
## Unmatched 0. 0
## Discarded 0. 0
Kiểm tra sự cân bằng trên dữ liệu MI
love.plot(matched_imputations,
abs = TRUE,
threshold = 0.1,
sample.names = c("Chưa ghép", "Đã ghép"))

Phân tích kết quả và tổng hợp (pooling)
outcome_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated))
final_pool <- pool(outcome_analysis)
summary(final_pool, conf.int = TRUE)
## term estimate std.error statistic df p.value 2.5 %
## 1 (Intercept) 130064.02 6427.445 20.235728 686.1113 9.293301e-72 117444.20
## 2 Garage_Treated 55031.22 6595.920 8.343221 739.8269 3.531153e-16 42082.27
## 97.5 % conf.low conf.high
## 1 142683.84 117444.20 142683.84
## 2 67980.17 42082.27 67980.17
Dựa vào love.plot, biến “Full Bath” và “Overall Qual” có SMD >
0.1. Do đó mô hình cân bằng chưa tốt. Thực hiện tiếp Doubly Robust
Estimation để điều chỉnh 2 biến trên một lần nữa.
outcome_analysis_robust <- with(matched_imputations, lm(SalePrice ~ Garage_Treated + `Full Bath` + `Overall Qual`))
robust_final <- pool(outcome_analysis_robust)
summary(robust_final, conf.int = TRUE)
## term estimate std.error statistic df p.value
## 1 (Intercept) -126074.78 6007.8743 -20.984923 67.28843 3.084281e-31
## 2 Garage_Treated 26777.32 4796.9436 5.582162 34.90586 2.768992e-06
## 3 `Full Bath` 26123.97 1896.4119 13.775475 1557.81977 7.974953e-41
## 4 `Overall Qual` 39351.54 744.7309 52.839947 2906.45949 0.000000e+00
## 2.5 % 97.5 % conf.low conf.high
## 1 -138065.60 -114083.95 -138065.60 -114083.95
## 2 17038.06 36516.57 17038.06 36516.57
## 3 22404.19 29843.76 22404.19 29843.76
## 4 37891.29 40811.79 37891.29 40811.79
Phân tích dưới nhóm (subgroup)
subgroup_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * `Year Built` + `Full Bath` + `Overall Qual`))
summary(pool(subgroup_analysis))
## term estimate std.error statistic df
## 1 (Intercept) 26205.53653 249110.8477 0.1051963 36.02310
## 2 Garage_Treated -571269.27561 256530.8453 -2.2269029 47.80795
## 3 `Year Built` -68.81465 126.0366 -0.5459893 36.90131
## 4 `Full Bath` 22965.41122 1965.7896 11.6825376 1279.22686
## 5 `Overall Qual` 37107.75393 840.0976 44.1707631 2193.05038
## 6 Garage_Treated:`Year Built` 304.31519 129.5870 2.3483459 49.99975
## p.value
## 1 9.168036e-01
## 2 3.069891e-02
## 3 5.883592e-01
## 4 4.936371e-30
## 5 2.160475e-305
## 6 2.285261e-02
subgroup_categorical <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf), labels = c("Pre-1950", "1950-1990", "1990-Post")) + `Full Bath` + `Overall Qual`))
run_subgroup <- function(data_matched, year_min, year_max, label) {
res_pool <- summary(pool(subgroup_categorical), conf.int = TRUE)
return(res_pool)}
res <- summary(pool(subgroup_categorical), conf.int = TRUE)
results_forest <- data.frame()
groups <- c("Pre-1950", "1950-1990", "1990-Post")
cuts <- c(-Inf, 1950, 1990, Inf)
for (i in 1:3) {}
model_forest <- with(matched_imputations,
lm(SalePrice ~ Garage_Treated : cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + `Full Bath` + `Overall Qual`))
res_forest <- summary(pool(model_forest), conf.int = TRUE)
plot_data <- res_forest[grep("Garage_Treated:", res_forest$term), ]
plot_data$Group <- c("Pre-1950", "1950-1990", "1990-Post")
ggplot(plot_data, aes(x = estimate, y = Group)) +
geom_point(size = 3, color = "blue") + # Điểm ước lượng
geom_errorbarh(aes(xmin = `2.5 %`, xmax = `97.5 %`), height = 0.2) + # Khoảng tin cậy
geom_vline(xintercept = 0, linetype = "dashed", color = "red") + # Đường 0 (không tác động)
labs(title = "Tác động của Garage theo Năm xây dựng (Forest Plot)",
x = "Mức tăng giá nhà (USD)",
y = "Năm xây dựng") +
theme_minimal()
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.

Phân tích E-value (Sensitivity Analysis)
Estimate <- 26777.32
SE <- 4796.9436
d <- 0.335
se <- 0.060
calc_Evalue <- function(RR) {
if (RR <= 1) return(1)
return(RR + sqrt(RR * (RR - 1)))
}
RR_estimate <- exp(0.91 * d)
E_point <- calc_Evalue(RR_estimate)
d_lower <- d - (1.96 * se)
RR_lower <- exp(0.91 * d_lower)
E_CI <- calc_Evalue(RR_lower)
cat("--- KẾT QUẢ TÍNH THỦ CÔNG ---\n")
## --- KẾT QUẢ TÍNH THỦ CÔNG ---
cat("E-value (Estimate):", round(E_point, 2), "\n")
## E-value (Estimate): 2.05
cat("E-value (Lower CI):", round(E_CI, 2), "\n")
## E-value (Lower CI): 1.74
Qui trình thực hiện:
Dữ liệu thô -> mice() -> imputed_data
imputed_data -> MatchThem::matchthem() ->
matched_imputations
matched_imputations -> with(…, lm(…)) -> outcome_analysis