Matching

install_if_needed <- function(pkg) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}


install_if_needed("tidyverse")

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

install_if_needed("Matching")

## Loading required package: Matching
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ## 
## ##  Matching (Version 4.10-15, Build Date: 2024-10-14)
## ##  See https://www.jsekhon.com for additional documentation.
## ##  Please cite software as:
## ##   Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
## ##   Software with Automated Balance Optimization: The Matching package for R.''
## ##   Journal of Statistical Software, 42(7): 1-52. 
## ##

install_if_needed("cem")

## Loading required package: cem
## Loading required package: tcltk
## Loading required package: lattice
## 
## How to use CEM? Type vignette("cem")

install_if_needed("ebal")

## Loading required package: ebal
## ##
## ## ebal Package: Implements Entropy Balancing.
## 
## ## See http://www.stanford.edu/~jhain/ for additional information.

install_if_needed("WeightIt")

## Loading required package: WeightIt

install_if_needed("causaldata")

## Loading required package: causaldata

install_if_needed("boot")

## Loading required package: boot
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:lattice':
## 
##     melanoma

library(Matching); library(tidyverse)
br <- causaldata::black_politicians

# Outcome
Y <- br %>%
    pull(responded)
# Treatment
D <- br %>%
    pull(leg_black)
# Matching variables
# Note select() is also in the Matching package, so we specify dplyr
X <- br %>%
    dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
    as.matrix()

# Weight = 2, oddly, denotes Mahalanobis distance
M <- Match(Y, D, X, Weight = 2, caliper = 1)

# See treatment effect estimate
summary(M)

## 
## Estimate...  -0.0073462 
## AI SE......  0.072683 
## T-stat.....  -0.10107 
## p.val......  0.91949 
## 
## Original number of observations..............  5593 
## Original number of treated obs...............  364 
## Matched number of observations...............  363 
## Matched number of observations  (unweighted).  405 
## 
## Caliper (SDs)........................................   1 1 1 
## Number of obs dropped by 'exact' or 'caliper'  1

# Get matched data for use elsewhere. Note that this approach will 
# duplicate each observation for each time it was matched
matched_treated <- tibble(id = M$index.treated,
                          weight = M$weights)
matched_control <- tibble(id = M$index.control,
                          weight = M$weights)
matched_sets <- bind_rows(matched_treated,
                          matched_control) 
# Simplify to one row per observation
matched_sets <- matched_sets %>%
                    group_by(id) %>%
                    summarize(weight = sum(weight))
# And bring back to data
matched_br <- br %>%
    mutate(id = row_number()) %>%
    left_join(matched_sets, by = 'id')

# To be used like this! The standard errors here are wrong
lm(responded~leg_black, data = matched_br, weights = weight)

## 
## Call:
## lm(formula = responded ~ leg_black, data = matched_br, weights = weight)
## 
## Coefficients:
## (Intercept)    leg_black  
##    0.398531    -0.007346

library(cem)
library(tidyverse)
br <- causaldata::black_politicians

# Limit to just the relevant variables and omit missings
brcem <- br %>%
    dplyr::select(responded, leg_black, medianhhincom, 
    blackpercent, leg_democrat) %>%
    na.omit() %>%
    as.data.frame() # Must be a data.frame, not a tibble

# Create breaks. Use quantiles to create quantile cuts or manually for 
# evenly spaced (You can also skip this and let it do it automatically,
# although you MUST do it yourself for binary variables). Be sure
# to include the "edges" (max and min values). So! Six bins each:
inc_bins <- quantile(brcem$medianhhincom, (0:6)/6)

create_even_breaks <- function(x, n) {
    minx <- min(x)
    maxx <- max(x)
    
    return(minx + ((0:n)/n)*(maxx-minx))
}

bp_bins <- create_even_breaks(brcem$blackpercent, 6)

# For binary, we specifically need two even bins
ld_bins <- create_even_breaks(brcem$leg_democrat,2)

# Make a list of bins
allbreaks <- list('medianhhincom' = inc_bins,
                  'blackpercent' = bp_bins,
                  'leg_democrat' = ld_bins)

# Match, being sure not to match on the outcome
# Note the baseline.group is the *treated* group
c <- cem(treatment = 'leg_black', data = brcem,
         baseline.group =  '1',
         drop = 'responded',
         cutpoints = allbreaks,
         keep.all = TRUE)

## 
## Using 'leg_black'='1' as baseline group

# Get weights for other purposes
brcem <- brcem %>%
    mutate(cem_weight = c$w)
lm(responded~leg_black, data = brcem, weights = cem_weight)

## 
## Call:
## lm(formula = responded ~ leg_black, data = brcem, weights = cem_weight)
## 
## Coefficients:
## (Intercept)    leg_black  
##     0.34680      0.02302

# Or use their estimation function att. Note there are many options 
# for these functions including logit or machine-learing treatment 
# estimation. Read the docs!
att(c, responded ~ leg_black, data = brcem)

## 
##             G0  G1
## All       5229 364
## Matched   4491 338
## Unmatched  738  26
## 
## Linear regression model on CEM matched data:
## 
## SATT point estimate: 0.023020 (p.value=0.391783)
## 95% conf. interval: [-0.029659, 0.075699]

library(ebal); library(tidyverse); library(modelsummary)
br <- causaldata::black_politicians

# Outcome
Y <- br %>%
    pull(responded)
# Treatment
D <- br %>%
    pull(leg_black)
# Matching variables
X <- br %>%
    dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
    # Add square terms to match variances if we like
    mutate(incsq = medianhhincom^2,
    bpsq = blackpercent^2) %>%
    as.matrix()

eb <- ebalance(D, X)

## Converged within tolerance

# Get weights for usage elsewhere
# Noting that this contains only control weights
br_treat <- br %>%
    filter(leg_black == 1) %>%
    mutate(weights = 1)
br_con <- br %>%
    filter(leg_black == 0) %>%
    mutate(weights = eb$w)
br <- bind_rows(br_treat, br_con)

m <- lm(responded ~ leg_black, data = br, weights = weights)
msummary(m, stars = c('*' = .1, '**' = .05, '***' = .01))

	(1)
* p < 0.1, p < 0.05, * p < 0.01
(Intercept)	0.302***
	(0.009)
leg_black	0.091***
	(0.013)
Num.Obs.	5593
R2	0.009
R2 Adj.	0.009
AIC	24483.2
BIC	24503.1
Log.Lik.	-12238.623
F	51.253
RMSE	0.51

library(tidyverse)
library(WeightIt)
library(survey)

## Loading required package: grid

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:boot':
## 
##     aml

## 
## Attaching package: 'survey'

## The following object is masked from 'package:WeightIt':
## 
##     calibrate

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(causaldata) 

# 数据准备
br <- black_politicians

# 去除缺失值（确保拟合不会出错）
br <- br %>%
  drop_na(medianhhincom, blackpercent, leg_democrat, responded, leg_black)

# 使用 WeightIt 估计倾向得分并生成 IPW 权重（logit模型）
w.out <- weightit(leg_black ~ medianhhincom + blackpercent + leg_democrat,
                  data = br, method = "ps", estimand = "ATE")

## Warning: Some extreme weights were generated. Examine them with `summary()` and
## maybe trim them with `trim()`.

# 查看权重分布（可选）
summary(w.out)

##                   Summary of weights
## 
## - Weight ranges:
## 
##            Min                                    Max
## treated 1.0035 |---------------------------| 777.6755
## control 1.0006 ||                             37.5934
## 
## - Units with the 5 most extreme weights by group:
##                                                   
##              2117    3538     2107   3570     4047
##  treated 484.5545 668.326 684.4767 753.16 777.6755
##              2585    2609     4641   1808     2607
##  control   19.275  20.252  21.1548 37.303  37.5934
## 
## - Weight statistics:
## 
##         Coef of Var   MAD Entropy # Zeros
## treated       5.065 1.545   2.415       0
## control       0.950 0.140   0.110       0
## 
## - Effective Sample Sizes:
## 
##            Control Treated
## Unweighted 5229.    364.  
## Weighted   2749.12   13.69

# 把权重加入原始数据中
br$ipw <- w.out$weights

# 用 survey 包进行加权回归（带 robust 标准误）
design <- svydesign(ids = ~1, weights = ~ipw, data = br)

model <- svyglm(responded ~ leg_black, design = design)

summary(model)

## 
## Call:
## svyglm(formula = responded ~ leg_black, design = design)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~ipw, data = br)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.410037   0.008808  46.554   <2e-16 ***
## leg_black   0.149895   0.129413   1.158    0.247    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2442293)
## 
## Number of Fisher Scoring iterations: 2

library(tidyverse)
br <- causaldata::black_politicians

# We can estimate our own propensity score
m <- glm(leg_black ~ medianhhincom + blackpercent + leg_democrat,
    data = br, family = binomial(link = 'logit'))
# Get predicted values
br <- br %>%
    mutate(ps = predict(m, type = 'response'))

# Create IPW weights
br <- br %>%
    mutate(ipw = case_when(
    leg_black == 1 ~ 1/ps,
    leg_black == 0 ~ 1/(1-ps)))

# Density plots for raw data
ggplot(br, aes(x = medianhhincom, color = factor(leg_black))) + 
    geom_density()

# And using our matching weights
ggplot(br, aes(x = medianhhincom, color = factor(leg_black),
    weight = ipw)) + geom_density()

library(cem); library(tidyverse); library(modelsummary)
br <- causaldata::black_politicians

# This copies the CEM code from the CEM section
# See that section's code for comments and notes

# Limit to just the relevant variables and omit missings
# (of which there are none in this data)
brcem <- br %>%
    dplyr::select(responded, leg_black, medianhhincom, 
    blackpercent, leg_democrat) %>%
    na.omit() %>%
    as.data.frame() 

# Create breaks
inc_bins <- quantile(brcem$medianhhincom, (0:6)/6)

create_even_breaks <- function(x, n) {
    minx <- min(x)
    maxx <- max(x)
    return(minx + ((0:n)/n)*(maxx-minx))
}

bp_bins <- create_even_breaks(brcem$blackpercent, 6)
ld_bins <- create_even_breaks(brcem$leg_democrat,2)

allbreaks <- list('medianhhincom' = inc_bins,
    'blackpercent' = bp_bins,
    'leg_democrat' = ld_bins)

c <- cem(treatment = 'leg_black', data = brcem,
    baseline.group =  '1', drop = 'responded',
    cutpoints = allbreaks, keep.all = TRUE)

## 
## Using 'leg_black'='1' as baseline group

# Get weights for other purposes.  Note this exact code only 
# works because we didn't have to drop any NAs. If we did, 
# lining things up would be trickier
br <- br %>%
    mutate(cem_weight = c$w)
m1 <- lm(responded~leg_black*treat_out + nonblacknonwhite +
    black_medianhh + white_medianhh + statessquireindex + 
    totalpop + urbanpercent, data = br, weights = cem_weight)
msummary(m1, stars = c('*' = .1, '**' = .05, '***' = .01))

	(1)
* p < 0.1, p < 0.05, * p < 0.01
(Intercept)	0.561***
	(0.027)
leg_black	-0.142***
	(0.035)
treat_out	-0.351***
	(0.013)
nonblacknonwhite	0.089**
	(0.037)
black_medianhh	0.191***
	(0.017)
white_medianhh	-0.169***
	(0.015)
statessquireindex	-0.516***
	(0.087)
totalpop	0.013***
	(0.001)
urbanpercent	0.123***
	(0.024)
leg_black × treat_out	0.229***
	(0.048)
Num.Obs.	4829
R2	0.192
R2 Adj.	0.191
AIC	15973.7
BIC	16045.0
Log.Lik.	-7975.827
F	127.431
RMSE	0.53

plot(m1)

library(boot); library(tidyverse)
br <- causaldata::black_politicians

# Function to do IPW estimation with regression adjustment
ipwra <- function(br, index = 1:nrow(br)) {
    # Apply bootstrap index
    br <- br %>% slice(index)
    
    # estimate and predict propensity score
    m <- glm(leg_black ~ medianhhincom + blackpercent + leg_democrat,
             data = br, family = binomial(link = 'logit'))
    br <- br %>%
        mutate(ps = predict(m, type = 'response'))
    
    # Trim control observations outside of treated PS range
    minps <- br %>%
        filter(leg_black == 1) %>%
        pull(ps) %>%
        min(na.rm = TRUE)
    maxps <- br %>%
        filter(leg_black == 1) %>%
        pull(ps) %>%
        max(na.rm = TRUE)
    br <- br %>%
        filter(ps >= minps & ps <= maxps)
    
    # Create IPW weights
    br <- br %>%
        mutate(ipw = case_when(
        leg_black == 1 ~ 1/ps,
        leg_black == 0 ~ 1/(1-ps)))
    
    # Estimate difference
    w_means <- br %>% 
        group_by(leg_black) %>%
        summarize(m = weighted.mean(responded, w = ipw)) %>%
        arrange(leg_black)
    
    return(w_means$m[2] - w_means$m[1])
}


b <- boot(br, ipwra, R = 200)
# See estimate and standard error
b

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = br, statistic = ipwra, R = 200)
## 
## 
## Bootstrap Statistics :
##      original      bias    std. error
## t1* 0.1563906 0.007494783   0.1389994

install.packages("Matching")

## Warning: package 'Matching' is in use and will not be installed

library(Matching); library(tidyverse)

br <- causaldata::black_politicians

# Outcome
Y <- br %>%
  pull(responded)
# Treatment
D <- br %>%
  pull(leg_black)
# Matching variables
# Note select() is also in the Matching package, so we specify dplyr
X <- br %>%
  dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
  as.matrix()

# Weight = 2, oddly, denotes Mahalanobis distance
M <- Match(Y, D, X, Weight = 2, caliper = 1)

# See treatment effect estimate
summary(M)

## 
## Estimate...  -0.0073462 
## AI SE......  0.072683 
## T-stat.....  -0.10107 
## p.val......  0.91949 
## 
## Original number of observations..............  5593 
## Original number of treated obs...............  364 
## Matched number of observations...............  363 
## Matched number of observations  (unweighted).  405 
## 
## Caliper (SDs)........................................   1 1 1 
## Number of obs dropped by 'exact' or 'caliper'  1

# Get matched data for use elsewhere. Note that this approach will 
# duplicate each observation for each time it was matched
matched_treated <- tibble(id = M$index.treated,
                          weight = M$weights)
matched_control <- tibble(id = M$index.control,
                          weight = M$weights)
matched_sets <- bind_rows(matched_treated,
                          matched_control) 
# Simplify to one row per observation
matched_sets <- matched_sets %>%
  group_by(id) %>%
  summarize(weight = sum(weight))
# And bring back to data
matched_br <- br %>%
  mutate(id = row_number()) %>%
  left_join(matched_sets, by = 'id')

# To be used like this! The standard errors here are wrong
p<-lm(responded~leg_black, data = matched_br, weights = weight)
p

## 
## Call:
## lm(formula = responded ~ leg_black, data = matched_br, weights = weight)
## 
## Coefficients:
## (Intercept)    leg_black  
##    0.398531    -0.007346

plot(p)

Matching

“Junhe Yin”

2025-05-25