Nhập dữ liệu Ameshousing và chạy phân tích

library(AmesHousing)
data("ames_raw")
head(ames_raw)
## # A tibble: 6 × 82
##   Order PID     `MS SubClass` `MS Zoning` `Lot Frontage` `Lot Area` Street Alley
##   <int> <chr>   <chr>         <chr>                <int>      <int> <chr>  <chr>
## 1     1 052630… 020           RL                     141      31770 Pave   <NA> 
## 2     2 052635… 020           RH                      80      11622 Pave   <NA> 
## 3     3 052635… 020           RL                      81      14267 Pave   <NA> 
## 4     4 052635… 020           RL                      93      11160 Pave   <NA> 
## 5     5 052710… 060           RL                      74      13830 Pave   <NA> 
## 6     6 052710… 060           RL                      78       9978 Pave   <NA> 
## # ℹ 74 more variables: `Lot Shape` <chr>, `Land Contour` <chr>,
## #   Utilities <chr>, `Lot Config` <chr>, `Land Slope` <chr>,
## #   Neighborhood <chr>, `Condition 1` <chr>, `Condition 2` <chr>,
## #   `Bldg Type` <chr>, `House Style` <chr>, `Overall Qual` <int>,
## #   `Overall Cond` <int>, `Year Built` <int>, `Year Remod/Add` <int>,
## #   `Roof Style` <chr>, `Roof Matl` <chr>, `Exterior 1st` <chr>,
## #   `Exterior 2nd` <chr>, `Mas Vnr Type` <chr>, `Mas Vnr Area` <int>, …
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(MatchIt)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(EValue)
library(miceadds)
## * miceadds 3.18-36 (2025-09-12 09:54:54)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MatchThem)
## 
## Attaching package: 'MatchThem'
## 
## The following objects are masked from 'package:mice':
## 
##     cbind, pool
## 
## The following object is masked from 'package:base':
## 
##     cbind
library(cobalt)
##  cobalt (Version 4.6.1, Build Date: 2025-08-20)
## 
## Attaching package: 'cobalt'
## 
## The following object is masked from 'package:MatchIt':
## 
##     lalonde
library(broom)
attach(ames_raw)

Chọn biến liên quan đưa vào phân tích

data_for_mi <- ames_raw[, c("SalePrice", "Garage Cars", "Lot Area", "Overall Qual", "Year Built", "Total Bsmt SF", "Full Bath", "Lot Frontage")]
data_for_mi$Garage_Treated <- as.numeric(data_for_mi$`Garage Cars` > 0)
data_for_mi <- data_for_mi %>%
  select(-`Garage Cars`)
head(data_for_mi)
## # A tibble: 6 × 8
##   SalePrice `Lot Area` `Overall Qual` `Year Built` `Total Bsmt SF` `Full Bath`
##       <int>      <int>          <int>        <int>           <int>       <int>
## 1    215000      31770              6         1960            1080           1
## 2    105000      11622              5         1961             882           1
## 3    172000      14267              6         1958            1329           1
## 4    244000      11160              7         1968            2110           2
## 5    189900      13830              5         1997             928           2
## 6    195500       9978              6         1998             926           2
## # ℹ 2 more variables: `Lot Frontage` <int>, Garage_Treated <dbl>
summary(data_for_mi)
##    SalePrice         Lot Area       Overall Qual      Year Built  
##  Min.   : 12789   Min.   :  1300   Min.   : 1.000   Min.   :1872  
##  1st Qu.:129500   1st Qu.:  7440   1st Qu.: 5.000   1st Qu.:1954  
##  Median :160000   Median :  9436   Median : 6.000   Median :1973  
##  Mean   :180796   Mean   : 10148   Mean   : 6.095   Mean   :1971  
##  3rd Qu.:213500   3rd Qu.: 11555   3rd Qu.: 7.000   3rd Qu.:2001  
##  Max.   :755000   Max.   :215245   Max.   :10.000   Max.   :2010  
##                                                                   
##  Total Bsmt SF    Full Bath      Lot Frontage    Garage_Treated  
##  Min.   :   0   Min.   :0.000   Min.   : 21.00   Min.   :0.0000  
##  1st Qu.: 793   1st Qu.:1.000   1st Qu.: 58.00   1st Qu.:1.0000  
##  Median : 990   Median :2.000   Median : 68.00   Median :1.0000  
##  Mean   :1052   Mean   :1.567   Mean   : 69.22   Mean   :0.9464  
##  3rd Qu.:1302   3rd Qu.:2.000   3rd Qu.: 80.00   3rd Qu.:1.0000  
##  Max.   :6110   Max.   :4.000   Max.   :313.00   Max.   :1.0000  
##  NA's   :1                      NA's   :490      NA's   :1
ps_formula <- Garage_Treated ~ `Lot Area`+ `Overall Qual`+ `Year Built`+ `Total Bsmt SF` + `Full Bath` + `Lot Frontage`

Thực hiện mutiple imputation và chạy PSA bằng hàm “matchthem” trong package “MatchThem” (nhằm trách lỗi)

imputed_data <- mice(data_for_mi, m=5, seed = 123, method = "pmm")
## 
##  iter imp variable
##   1   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   5  Total Bsmt SF  Lot Frontage  Garage_Treated
matched_imputations <- matchthem(ps_formula,
    datasets = imputed_data,
    approach = "within",
    method = "full",
    distance = "glm")
## 
## Matching Observations  | dataset: #1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #2
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #3
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## #4
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##  #5
summary(matched_imputations)
## Summarizing            | dataset: #1
## 
## Call:
## matchthem(formula = ps_formula, datasets = imputed_data, approach = "within", 
##     method = "full", distance = "glm")
## 
## Summary of Balance for All Data:
##               Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance             0.9536        0.8188          1.8651     0.2314    0.3458
## Lot Area         10285.6924     7714.5605          0.3198     6.0302    0.1552
## Overall Qual         6.1785        4.6178          1.1303     1.6070    0.1561
## Year Built        1972.8868     1944.3248          0.9695     0.8893    0.2312
## Total Bsmt SF     1066.6628      783.5159          0.6464     1.2092    0.1658
## Full Bath            1.5792        1.3439          0.4268     1.0430    0.0480
## Lot Frontage        70.5957       59.7707          0.4570     1.3241    0.0815
##               eCDF Max
## distance        0.5882
## Lot Area        0.2740
## Overall Qual    0.4821
## Year Built      0.3873
## Total Bsmt SF   0.2947
## Full Bath       0.2338
## Lot Frontage    0.3155
## 
## Summary of Balance for Matched Data:
##               Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance             0.9536        0.9529          0.0101     0.9416    0.0277
## Lot Area         10285.6924     9470.0631          0.1014     4.9649    0.0703
## Overall Qual         6.1785        5.7912          0.2805     1.7831    0.0420
## Year Built        1972.8868     1970.9598          0.0654     0.6013    0.0622
## Total Bsmt SF     1066.6628      974.3260          0.2108     1.8465    0.0683
## Full Bath            1.5792        1.1662          0.7493     1.8915    0.0839
## Lot Frontage        70.5957       66.4769          0.1739     2.5369    0.0604
##               eCDF Max Std. Pair Dist.
## distance        0.2113          0.0204
## Lot Area        0.1787          0.5896
## Overall Qual    0.1551          0.6488
## Year Built      0.2074          0.8067
## Total Bsmt SF   0.2470          0.8709
## Full Bath       0.3974          1.1731
## Lot Frontage    0.2973          0.8694
## 
## Sample Sizes:
##               Control Treated
## All            157.      2773
## Matched (ESS)   17.89    2773
## Matched        157.      2773
## Unmatched        0.         0
## Discarded        0.         0

Kiểm tra sự cân bằng trên dữ liệu MI

love.plot(matched_imputations,
     abs = TRUE,
     threshold = 0.1,
     sample.names = c("Chưa ghép", "Đã ghép"))

Phân tích kết quả và tổng hợp (pooling)

outcome_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated))
final_pool <- pool(outcome_analysis)
summary(final_pool, conf.int = TRUE)
##             term  estimate std.error statistic       df      p.value     2.5 %
## 1    (Intercept) 130064.02  6427.445 20.235728 686.1113 9.293301e-72 117444.20
## 2 Garage_Treated  55031.22  6595.920  8.343221 739.8269 3.531153e-16  42082.27
##      97.5 %  conf.low conf.high
## 1 142683.84 117444.20 142683.84
## 2  67980.17  42082.27  67980.17

Dựa vào love.plot, biến “Full Bath” và “Overall Qual” có SMD > 0.1. Do đó mô hình cân bằng chưa tốt. Thực hiện tiếp Doubly Robust Estimation để điều chỉnh 2 biến trên một lần nữa.

outcome_analysis_robust <- with(matched_imputations, lm(SalePrice ~ Garage_Treated + `Full Bath` + `Overall Qual`))
robust_final <- pool(outcome_analysis_robust)
summary(robust_final, conf.int = TRUE)
##             term   estimate std.error  statistic         df      p.value
## 1    (Intercept) -126074.78 6007.8743 -20.984923   67.28843 3.084281e-31
## 2 Garage_Treated   26777.32 4796.9436   5.582162   34.90586 2.768992e-06
## 3    `Full Bath`   26123.97 1896.4119  13.775475 1557.81977 7.974953e-41
## 4 `Overall Qual`   39351.54  744.7309  52.839947 2906.45949 0.000000e+00
##        2.5 %     97.5 %   conf.low  conf.high
## 1 -138065.60 -114083.95 -138065.60 -114083.95
## 2   17038.06   36516.57   17038.06   36516.57
## 3   22404.19   29843.76   22404.19   29843.76
## 4   37891.29   40811.79   37891.29   40811.79

Phân tích dưới nhóm (subgroup)

subgroup_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * `Year Built` + `Full Bath` + `Overall Qual`))
summary(pool(subgroup_analysis))
##                          term      estimate   std.error  statistic         df
## 1                 (Intercept)   26205.53653 249110.8477  0.1051963   36.02310
## 2              Garage_Treated -571269.27561 256530.8453 -2.2269029   47.80795
## 3                `Year Built`     -68.81465    126.0366 -0.5459893   36.90131
## 4                 `Full Bath`   22965.41122   1965.7896 11.6825376 1279.22686
## 5              `Overall Qual`   37107.75393    840.0976 44.1707631 2193.05038
## 6 Garage_Treated:`Year Built`     304.31519    129.5870  2.3483459   49.99975
##         p.value
## 1  9.168036e-01
## 2  3.069891e-02
## 3  5.883592e-01
## 4  4.936371e-30
## 5 2.160475e-305
## 6  2.285261e-02
subgroup_categorical <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf), labels = c("Pre-1950", "1950-1990", "1990-Post")) + `Full Bath` + `Overall Qual`))
run_subgroup <- function(data_matched, year_min, year_max, label) {
        res_pool <- summary(pool(subgroup_categorical), conf.int = TRUE)
        return(res_pool)}
res <- summary(pool(subgroup_categorical), conf.int = TRUE)
results_forest <- data.frame()
groups <- c("Pre-1950", "1950-1990", "1990-Post")
cuts <- c(-Inf, 1950, 1990, Inf)
for (i in 1:3) {}
  model_forest <- with(matched_imputations,
  lm(SalePrice ~ Garage_Treated : cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + `Full Bath` + `Overall Qual`))
res_forest <- summary(pool(model_forest), conf.int = TRUE)
plot_data <- res_forest[grep("Garage_Treated:", res_forest$term), ]
plot_data$Group <- c("Pre-1950", "1950-1990", "1990-Post")
ggplot(plot_data, aes(x = estimate, y = Group)) +
  geom_point(size = 3, color = "blue") + # Điểm ước lượng
  geom_errorbarh(aes(xmin = `2.5 %`, xmax = `97.5 %`), height = 0.2) + # Khoảng tin cậy
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") + # Đường 0 (không tác động)
  labs(title = "Tác động của Garage theo Năm xây dựng (Forest Plot)",
  x = "Mức tăng giá nhà (USD)",
  y = "Năm xây dựng") +
  theme_minimal()
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.

Phân tích E-value (Sensitivity Analysis)

Estimate <- 26777.32
SE <- 4796.9436
d <- 0.335
se <- 0.060
calc_Evalue <- function(RR) {
  if (RR <= 1) return(1)
  return(RR + sqrt(RR * (RR - 1)))
}
RR_estimate <- exp(0.91 * d)
E_point <- calc_Evalue(RR_estimate)
d_lower <- d - (1.96 * se)
RR_lower <- exp(0.91 * d_lower)
E_CI <- calc_Evalue(RR_lower)
cat("--- KẾT QUẢ TÍNH THỦ CÔNG ---\n")
## --- KẾT QUẢ TÍNH THỦ CÔNG ---
cat("E-value (Estimate):", round(E_point, 2), "\n")
## E-value (Estimate): 2.05
cat("E-value (Lower CI):", round(E_CI, 2), "\n")
## E-value (Lower CI): 1.74

Qui trình thực hiện:

Dữ liệu thô -> mice() -> imputed_data

imputed_data -> MatchThem::matchthem() -> matched_imputations

matched_imputations -> with(…, lm(…)) -> outcome_analysis