Nhập dữ liệu Ameshousing và chạy phân tích

library(AmesHousing)
data("ames_raw")
head(ames_raw)

## # A tibble: 6 × 82
##   Order PID     `MS SubClass` `MS Zoning` `Lot Frontage` `Lot Area` Street Alley
##   <int> <chr>   <chr>         <chr>                <int>      <int> <chr>  <chr>
## 1     1 052630… 020           RL                     141      31770 Pave   <NA> 
## 2     2 052635… 020           RH                      80      11622 Pave   <NA> 
## 3     3 052635… 020           RL                      81      14267 Pave   <NA> 
## 4     4 052635… 020           RL                      93      11160 Pave   <NA> 
## 5     5 052710… 060           RL                      74      13830 Pave   <NA> 
## 6     6 052710… 060           RL                      78       9978 Pave   <NA> 
## # ℹ 74 more variables: `Lot Shape` <chr>, `Land Contour` <chr>,
## #   Utilities <chr>, `Lot Config` <chr>, `Land Slope` <chr>,
## #   Neighborhood <chr>, `Condition 1` <chr>, `Condition 2` <chr>,
## #   `Bldg Type` <chr>, `House Style` <chr>, `Overall Qual` <int>,
## #   `Overall Cond` <int>, `Year Built` <int>, `Year Remod/Add` <int>,
## #   `Roof Style` <chr>, `Roof Matl` <chr>, `Exterior 1st` <chr>,
## #   `Exterior 2nd` <chr>, `Mas Vnr Type` <chr>, `Mas Vnr Area` <int>, …

library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(MatchIt)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(EValue)
library(miceadds)

## * miceadds 3.18-36 (2025-09-12 09:54:54)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(MatchThem)

## 
## Attaching package: 'MatchThem'
## 
## The following objects are masked from 'package:mice':
## 
##     cbind, pool
## 
## The following object is masked from 'package:base':
## 
##     cbind

library(cobalt)

##  cobalt (Version 4.6.1, Build Date: 2025-08-20)
## 
## Attaching package: 'cobalt'
## 
## The following object is masked from 'package:MatchIt':
## 
##     lalonde

library(broom)
attach(ames_raw)

Chọn biến liên quan đưa vào phân tích

data_for_mi <- ames_raw[, c("SalePrice", "Garage Cars", "Lot Area", "Overall Qual", "Year Built", "Total Bsmt SF", "Full Bath", "Lot Frontage")]
data_for_mi$Garage_Treated <- as.numeric(data_for_mi$`Garage Cars` > 0)
data_for_mi <- data_for_mi %>%
  select(-`Garage Cars`)
head(data_for_mi)

## # A tibble: 6 × 8
##   SalePrice `Lot Area` `Overall Qual` `Year Built` `Total Bsmt SF` `Full Bath`
##       <int>      <int>          <int>        <int>           <int>       <int>
## 1    215000      31770              6         1960            1080           1
## 2    105000      11622              5         1961             882           1
## 3    172000      14267              6         1958            1329           1
## 4    244000      11160              7         1968            2110           2
## 5    189900      13830              5         1997             928           2
## 6    195500       9978              6         1998             926           2
## # ℹ 2 more variables: `Lot Frontage` <int>, Garage_Treated <dbl>

summary(data_for_mi)

##    SalePrice         Lot Area       Overall Qual      Year Built  
##  Min.   : 12789   Min.   :  1300   Min.   : 1.000   Min.   :1872  
##  1st Qu.:129500   1st Qu.:  7440   1st Qu.: 5.000   1st Qu.:1954  
##  Median :160000   Median :  9436   Median : 6.000   Median :1973  
##  Mean   :180796   Mean   : 10148   Mean   : 6.095   Mean   :1971  
##  3rd Qu.:213500   3rd Qu.: 11555   3rd Qu.: 7.000   3rd Qu.:2001  
##  Max.   :755000   Max.   :215245   Max.   :10.000   Max.   :2010  
##                                                                   
##  Total Bsmt SF    Full Bath      Lot Frontage    Garage_Treated  
##  Min.   :   0   Min.   :0.000   Min.   : 21.00   Min.   :0.0000  
##  1st Qu.: 793   1st Qu.:1.000   1st Qu.: 58.00   1st Qu.:1.0000  
##  Median : 990   Median :2.000   Median : 68.00   Median :1.0000  
##  Mean   :1052   Mean   :1.567   Mean   : 69.22   Mean   :0.9464  
##  3rd Qu.:1302   3rd Qu.:2.000   3rd Qu.: 80.00   3rd Qu.:1.0000  
##  Max.   :6110   Max.   :4.000   Max.   :313.00   Max.   :1.0000  
##  NA's   :1                      NA's   :490      NA's   :1

ps_formula <- Garage_Treated ~ `Lot Area`+ `Overall Qual`+ `Year Built`+ `Total Bsmt SF` + `Full Bath` + `Lot Frontage`

Thực hiện mutiple imputation và chạy PSA bằng hàm “matchthem” trong package “MatchThem” (nhằm trách lỗi)

imputed_data <- mice(data_for_mi, m=5, seed = 123, method = "pmm")

## 
##  iter imp variable
##   1   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   1   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   2   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   3   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   4   5  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   1  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   2  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   3  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   4  Total Bsmt SF  Lot Frontage  Garage_Treated
##   5   5  Total Bsmt SF  Lot Frontage  Garage_Treated

matched_imputations <- matchthem(ps_formula,
    datasets = imputed_data,
    approach = "within",
    method = "full",
    distance = "glm")

## 
## Matching Observations  | dataset: #1

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## #2

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## #3

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## #4

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

##  #5

summary(matched_imputations)

## Summarizing            | dataset: #1

## 
## Call:
## matchthem(formula = ps_formula, datasets = imputed_data, approach = "within", 
##     method = "full", distance = "glm")
## 
## Summary of Balance for All Data:
##               Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance             0.9536        0.8188          1.8651     0.2314    0.3458
## Lot Area         10285.6924     7714.5605          0.3198     6.0302    0.1552
## Overall Qual         6.1785        4.6178          1.1303     1.6070    0.1561
## Year Built        1972.8868     1944.3248          0.9695     0.8893    0.2312
## Total Bsmt SF     1066.6628      783.5159          0.6464     1.2092    0.1658
## Full Bath            1.5792        1.3439          0.4268     1.0430    0.0480
## Lot Frontage        70.5957       59.7707          0.4570     1.3241    0.0815
##               eCDF Max
## distance        0.5882
## Lot Area        0.2740
## Overall Qual    0.4821
## Year Built      0.3873
## Total Bsmt SF   0.2947
## Full Bath       0.2338
## Lot Frontage    0.3155
## 
## Summary of Balance for Matched Data:
##               Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance             0.9536        0.9529          0.0101     0.9416    0.0277
## Lot Area         10285.6924     9470.0631          0.1014     4.9649    0.0703
## Overall Qual         6.1785        5.7912          0.2805     1.7831    0.0420
## Year Built        1972.8868     1970.9598          0.0654     0.6013    0.0622
## Total Bsmt SF     1066.6628      974.3260          0.2108     1.8465    0.0683
## Full Bath            1.5792        1.1662          0.7493     1.8915    0.0839
## Lot Frontage        70.5957       66.4769          0.1739     2.5369    0.0604
##               eCDF Max Std. Pair Dist.
## distance        0.2113          0.0204
## Lot Area        0.1787          0.5896
## Overall Qual    0.1551          0.6488
## Year Built      0.2074          0.8067
## Total Bsmt SF   0.2470          0.8709
## Full Bath       0.3974          1.1731
## Lot Frontage    0.2973          0.8694
## 
## Sample Sizes:
##               Control Treated
## All            157.      2773
## Matched (ESS)   17.89    2773
## Matched        157.      2773
## Unmatched        0.         0
## Discarded        0.         0

Kiểm tra sự cân bằng trên dữ liệu MI

love.plot(matched_imputations,
     abs = TRUE,
     threshold = 0.1,
     sample.names = c("Chưa ghép", "Đã ghép"))

Phân tích kết quả và tổng hợp (pooling)

outcome_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated))
final_pool <- pool(outcome_analysis)
summary(final_pool, conf.int = TRUE)

##             term  estimate std.error statistic       df      p.value     2.5 %
## 1    (Intercept) 130064.02  6427.445 20.235728 686.1113 9.293301e-72 117444.20
## 2 Garage_Treated  55031.22  6595.920  8.343221 739.8269 3.531153e-16  42082.27
##      97.5 %  conf.low conf.high
## 1 142683.84 117444.20 142683.84
## 2  67980.17  42082.27  67980.17

Dựa vào love.plot, biến “Full Bath” và “Overall Qual” có SMD > 0.1. Do đó mô hình cân bằng chưa tốt. Thực hiện tiếp Doubly Robust Estimation để điều chỉnh 2 biến trên một lần nữa.

outcome_analysis_robust <- with(matched_imputations, lm(SalePrice ~ Garage_Treated + `Full Bath` + `Overall Qual`))
robust_final <- pool(outcome_analysis_robust)
summary(robust_final, conf.int = TRUE)

##             term   estimate std.error  statistic         df      p.value
## 1    (Intercept) -126074.78 6007.8743 -20.984923   67.28843 3.084281e-31
## 2 Garage_Treated   26777.32 4796.9436   5.582162   34.90586 2.768992e-06
## 3    `Full Bath`   26123.97 1896.4119  13.775475 1557.81977 7.974953e-41
## 4 `Overall Qual`   39351.54  744.7309  52.839947 2906.45949 0.000000e+00
##        2.5 %     97.5 %   conf.low  conf.high
## 1 -138065.60 -114083.95 -138065.60 -114083.95
## 2   17038.06   36516.57   17038.06   36516.57
## 3   22404.19   29843.76   22404.19   29843.76
## 4   37891.29   40811.79   37891.29   40811.79

Phân tích dưới nhóm (subgroup)

subgroup_analysis <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * `Year Built` + `Full Bath` + `Overall Qual`))
summary(pool(subgroup_analysis))

##                          term      estimate   std.error  statistic         df
## 1                 (Intercept)   26205.53653 249110.8477  0.1051963   36.02310
## 2              Garage_Treated -571269.27561 256530.8453 -2.2269029   47.80795
## 3                `Year Built`     -68.81465    126.0366 -0.5459893   36.90131
## 4                 `Full Bath`   22965.41122   1965.7896 11.6825376 1279.22686
## 5              `Overall Qual`   37107.75393    840.0976 44.1707631 2193.05038
## 6 Garage_Treated:`Year Built`     304.31519    129.5870  2.3483459   49.99975
##         p.value
## 1  9.168036e-01
## 2  3.069891e-02
## 3  5.883592e-01
## 4  4.936371e-30
## 5 2.160475e-305
## 6  2.285261e-02

subgroup_categorical <- with(matched_imputations, lm(SalePrice ~ Garage_Treated * cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf), labels = c("Pre-1950", "1950-1990", "1990-Post")) + `Full Bath` + `Overall Qual`))
run_subgroup <- function(data_matched, year_min, year_max, label) {
        res_pool <- summary(pool(subgroup_categorical), conf.int = TRUE)
        return(res_pool)}
res <- summary(pool(subgroup_categorical), conf.int = TRUE)
results_forest <- data.frame()
groups <- c("Pre-1950", "1950-1990", "1990-Post")
cuts <- c(-Inf, 1950, 1990, Inf)
for (i in 1:3) {}
  model_forest <- with(matched_imputations,
  lm(SalePrice ~ Garage_Treated : cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + cut(`Year Built`, breaks = c(-Inf, 1950, 1990, Inf)) + `Full Bath` + `Overall Qual`))
res_forest <- summary(pool(model_forest), conf.int = TRUE)
plot_data <- res_forest[grep("Garage_Treated:", res_forest$term), ]
plot_data$Group <- c("Pre-1950", "1950-1990", "1990-Post")
ggplot(plot_data, aes(x = estimate, y = Group)) +
  geom_point(size = 3, color = "blue") + # Điểm ước lượng
  geom_errorbarh(aes(xmin = `2.5 %`, xmax = `97.5 %`), height = 0.2) + # Khoảng tin cậy
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") + # Đường 0 (không tác động)
  labs(title = "Tác động của Garage theo Năm xây dựng (Forest Plot)",
  x = "Mức tăng giá nhà (USD)",
  y = "Năm xây dựng") +
  theme_minimal()

## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `height` was translated to `width`.

Phân tích E-value (Sensitivity Analysis)

Estimate <- 26777.32
SE <- 4796.9436
d <- 0.335
se <- 0.060
calc_Evalue <- function(RR) {
  if (RR <= 1) return(1)
  return(RR + sqrt(RR * (RR - 1)))
}
RR_estimate <- exp(0.91 * d)
E_point <- calc_Evalue(RR_estimate)
d_lower <- d - (1.96 * se)
RR_lower <- exp(0.91 * d_lower)
E_CI <- calc_Evalue(RR_lower)
cat("--- KẾT QUẢ TÍNH THỦ CÔNG ---\n")

## --- KẾT QUẢ TÍNH THỦ CÔNG ---

cat("E-value (Estimate):", round(E_point, 2), "\n")

## E-value (Estimate): 2.05

cat("E-value (Lower CI):", round(E_CI, 2), "\n")

## E-value (Lower CI): 1.74

Phân tích dữ liệu Ameshousing sử dụng Propensity Score Full matching kết hợp Multiple Imputation

Trần Dương Đăng Khoa

2025-11-20

Nhập dữ liệu Ameshousing và chạy phân tích

Chọn biến liên quan đưa vào phân tích

Thực hiện mutiple imputation và chạy PSA bằng hàm “matchthem” trong package “MatchThem” (nhằm trách lỗi)

Kiểm tra sự cân bằng trên dữ liệu MI

Phân tích kết quả và tổng hợp (pooling)

Dựa vào love.plot, biến “Full Bath” và “Overall Qual” có SMD > 0.1. Do đó mô hình cân bằng chưa tốt. Thực hiện tiếp Doubly Robust Estimation để điều chỉnh 2 biến trên một lần nữa.

Phân tích dưới nhóm (subgroup)

Phân tích E-value (Sensitivity Analysis)

Qui trình thực hiện:

Dữ liệu thô -> mice() -> imputed_data

imputed_data -> MatchThem::matchthem() -> matched_imputations

matched_imputations -> with(…, lm(…)) -> outcome_analysis