install_if_needed <- function(pkg) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg)
library(pkg, character.only = TRUE)
}
}
install_if_needed("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install_if_needed("Matching")
## Loading required package: Matching
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## ##
## ## Matching (Version 4.10-15, Build Date: 2024-10-14)
## ## See https://www.jsekhon.com for additional documentation.
## ## Please cite software as:
## ## Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
## ## Software with Automated Balance Optimization: The Matching package for R.''
## ## Journal of Statistical Software, 42(7): 1-52.
## ##
install_if_needed("cem")
## Loading required package: cem
## Loading required package: tcltk
## Loading required package: lattice
##
## How to use CEM? Type vignette("cem")
install_if_needed("ebal")
## Loading required package: ebal
## ##
## ## ebal Package: Implements Entropy Balancing.
##
## ## See http://www.stanford.edu/~jhain/ for additional information.
install_if_needed("WeightIt")
## Loading required package: WeightIt
install_if_needed("causaldata")
## Loading required package: causaldata
install_if_needed("boot")
## Loading required package: boot
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
library(Matching); library(tidyverse)
br <- causaldata::black_politicians
# Outcome
Y <- br %>%
pull(responded)
# Treatment
D <- br %>%
pull(leg_black)
# Matching variables
# Note select() is also in the Matching package, so we specify dplyr
X <- br %>%
dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
as.matrix()
# Weight = 2, oddly, denotes Mahalanobis distance
M <- Match(Y, D, X, Weight = 2, caliper = 1)
# See treatment effect estimate
summary(M)
##
## Estimate... -0.0073462
## AI SE...... 0.072683
## T-stat..... -0.10107
## p.val...... 0.91949
##
## Original number of observations.............. 5593
## Original number of treated obs............... 364
## Matched number of observations............... 363
## Matched number of observations (unweighted). 405
##
## Caliper (SDs)........................................ 1 1 1
## Number of obs dropped by 'exact' or 'caliper' 1
# Get matched data for use elsewhere. Note that this approach will
# duplicate each observation for each time it was matched
matched_treated <- tibble(id = M$index.treated,
weight = M$weights)
matched_control <- tibble(id = M$index.control,
weight = M$weights)
matched_sets <- bind_rows(matched_treated,
matched_control)
# Simplify to one row per observation
matched_sets <- matched_sets %>%
group_by(id) %>%
summarize(weight = sum(weight))
# And bring back to data
matched_br <- br %>%
mutate(id = row_number()) %>%
left_join(matched_sets, by = 'id')
# To be used like this! The standard errors here are wrong
lm(responded~leg_black, data = matched_br, weights = weight)
##
## Call:
## lm(formula = responded ~ leg_black, data = matched_br, weights = weight)
##
## Coefficients:
## (Intercept) leg_black
## 0.398531 -0.007346
library(cem)
library(tidyverse)
br <- causaldata::black_politicians
# Limit to just the relevant variables and omit missings
brcem <- br %>%
dplyr::select(responded, leg_black, medianhhincom,
blackpercent, leg_democrat) %>%
na.omit() %>%
as.data.frame() # Must be a data.frame, not a tibble
# Create breaks. Use quantiles to create quantile cuts or manually for
# evenly spaced (You can also skip this and let it do it automatically,
# although you MUST do it yourself for binary variables). Be sure
# to include the "edges" (max and min values). So! Six bins each:
inc_bins <- quantile(brcem$medianhhincom, (0:6)/6)
create_even_breaks <- function(x, n) {
minx <- min(x)
maxx <- max(x)
return(minx + ((0:n)/n)*(maxx-minx))
}
bp_bins <- create_even_breaks(brcem$blackpercent, 6)
# For binary, we specifically need two even bins
ld_bins <- create_even_breaks(brcem$leg_democrat,2)
# Make a list of bins
allbreaks <- list('medianhhincom' = inc_bins,
'blackpercent' = bp_bins,
'leg_democrat' = ld_bins)
# Match, being sure not to match on the outcome
# Note the baseline.group is the *treated* group
c <- cem(treatment = 'leg_black', data = brcem,
baseline.group = '1',
drop = 'responded',
cutpoints = allbreaks,
keep.all = TRUE)
##
## Using 'leg_black'='1' as baseline group
# Get weights for other purposes
brcem <- brcem %>%
mutate(cem_weight = c$w)
lm(responded~leg_black, data = brcem, weights = cem_weight)
##
## Call:
## lm(formula = responded ~ leg_black, data = brcem, weights = cem_weight)
##
## Coefficients:
## (Intercept) leg_black
## 0.34680 0.02302
# Or use their estimation function att. Note there are many options
# for these functions including logit or machine-learing treatment
# estimation. Read the docs!
att(c, responded ~ leg_black, data = brcem)
##
## G0 G1
## All 5229 364
## Matched 4491 338
## Unmatched 738 26
##
## Linear regression model on CEM matched data:
##
## SATT point estimate: 0.023020 (p.value=0.391783)
## 95% conf. interval: [-0.029659, 0.075699]
library(ebal); library(tidyverse); library(modelsummary)
br <- causaldata::black_politicians
# Outcome
Y <- br %>%
pull(responded)
# Treatment
D <- br %>%
pull(leg_black)
# Matching variables
X <- br %>%
dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
# Add square terms to match variances if we like
mutate(incsq = medianhhincom^2,
bpsq = blackpercent^2) %>%
as.matrix()
eb <- ebalance(D, X)
## Converged within tolerance
# Get weights for usage elsewhere
# Noting that this contains only control weights
br_treat <- br %>%
filter(leg_black == 1) %>%
mutate(weights = 1)
br_con <- br %>%
filter(leg_black == 0) %>%
mutate(weights = eb$w)
br <- bind_rows(br_treat, br_con)
m <- lm(responded ~ leg_black, data = br, weights = weights)
msummary(m, stars = c('*' = .1, '**' = .05, '***' = .01))
| |
(1) |
| * p < 0.1, ** p < 0.05, *** p < 0.01 |
| (Intercept) |
0.302*** |
|
(0.009) |
| leg_black |
0.091*** |
|
(0.013) |
| Num.Obs. |
5593 |
| R2 |
0.009 |
| R2 Adj. |
0.009 |
| AIC |
24483.2 |
| BIC |
24503.1 |
| Log.Lik. |
-12238.623 |
| F |
51.253 |
| RMSE |
0.51 |
library(tidyverse)
library(WeightIt)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:WeightIt':
##
## calibrate
## The following object is masked from 'package:graphics':
##
## dotchart
library(causaldata)
# 数据准备
br <- black_politicians
# 去除缺失值(确保拟合不会出错)
br <- br %>%
drop_na(medianhhincom, blackpercent, leg_democrat, responded, leg_black)
# 使用 WeightIt 估计倾向得分并生成 IPW 权重(logit模型)
w.out <- weightit(leg_black ~ medianhhincom + blackpercent + leg_democrat,
data = br, method = "ps", estimand = "ATE")
## Warning: Some extreme weights were generated. Examine them with `summary()` and
## maybe trim them with `trim()`.
# 查看权重分布(可选)
summary(w.out)
## Summary of weights
##
## - Weight ranges:
##
## Min Max
## treated 1.0035 |---------------------------| 777.6755
## control 1.0006 || 37.5934
##
## - Units with the 5 most extreme weights by group:
##
## 2117 3538 2107 3570 4047
## treated 484.5545 668.326 684.4767 753.16 777.6755
## 2585 2609 4641 1808 2607
## control 19.275 20.252 21.1548 37.303 37.5934
##
## - Weight statistics:
##
## Coef of Var MAD Entropy # Zeros
## treated 5.065 1.545 2.415 0
## control 0.950 0.140 0.110 0
##
## - Effective Sample Sizes:
##
## Control Treated
## Unweighted 5229. 364.
## Weighted 2749.12 13.69
# 把权重加入原始数据中
br$ipw <- w.out$weights
# 用 survey 包进行加权回归(带 robust 标准误)
design <- svydesign(ids = ~1, weights = ~ipw, data = br)
model <- svyglm(responded ~ leg_black, design = design)
summary(model)
##
## Call:
## svyglm(formula = responded ~ leg_black, design = design)
##
## Survey design:
## svydesign(ids = ~1, weights = ~ipw, data = br)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.410037 0.008808 46.554 <2e-16 ***
## leg_black 0.149895 0.129413 1.158 0.247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2442293)
##
## Number of Fisher Scoring iterations: 2
library(tidyverse)
br <- causaldata::black_politicians
# We can estimate our own propensity score
m <- glm(leg_black ~ medianhhincom + blackpercent + leg_democrat,
data = br, family = binomial(link = 'logit'))
# Get predicted values
br <- br %>%
mutate(ps = predict(m, type = 'response'))
# Create IPW weights
br <- br %>%
mutate(ipw = case_when(
leg_black == 1 ~ 1/ps,
leg_black == 0 ~ 1/(1-ps)))
# Density plots for raw data
ggplot(br, aes(x = medianhhincom, color = factor(leg_black))) +
geom_density()

# And using our matching weights
ggplot(br, aes(x = medianhhincom, color = factor(leg_black),
weight = ipw)) + geom_density()

library(cem); library(tidyverse); library(modelsummary)
br <- causaldata::black_politicians
# This copies the CEM code from the CEM section
# See that section's code for comments and notes
# Limit to just the relevant variables and omit missings
# (of which there are none in this data)
brcem <- br %>%
dplyr::select(responded, leg_black, medianhhincom,
blackpercent, leg_democrat) %>%
na.omit() %>%
as.data.frame()
# Create breaks
inc_bins <- quantile(brcem$medianhhincom, (0:6)/6)
create_even_breaks <- function(x, n) {
minx <- min(x)
maxx <- max(x)
return(minx + ((0:n)/n)*(maxx-minx))
}
bp_bins <- create_even_breaks(brcem$blackpercent, 6)
ld_bins <- create_even_breaks(brcem$leg_democrat,2)
allbreaks <- list('medianhhincom' = inc_bins,
'blackpercent' = bp_bins,
'leg_democrat' = ld_bins)
c <- cem(treatment = 'leg_black', data = brcem,
baseline.group = '1', drop = 'responded',
cutpoints = allbreaks, keep.all = TRUE)
##
## Using 'leg_black'='1' as baseline group
# Get weights for other purposes. Note this exact code only
# works because we didn't have to drop any NAs. If we did,
# lining things up would be trickier
br <- br %>%
mutate(cem_weight = c$w)
m1 <- lm(responded~leg_black*treat_out + nonblacknonwhite +
black_medianhh + white_medianhh + statessquireindex +
totalpop + urbanpercent, data = br, weights = cem_weight)
msummary(m1, stars = c('*' = .1, '**' = .05, '***' = .01))
| |
(1) |
| * p < 0.1, ** p < 0.05, *** p < 0.01 |
| (Intercept) |
0.561*** |
|
(0.027) |
| leg_black |
-0.142*** |
|
(0.035) |
| treat_out |
-0.351*** |
|
(0.013) |
| nonblacknonwhite |
0.089** |
|
(0.037) |
| black_medianhh |
0.191*** |
|
(0.017) |
| white_medianhh |
-0.169*** |
|
(0.015) |
| statessquireindex |
-0.516*** |
|
(0.087) |
| totalpop |
0.013*** |
|
(0.001) |
| urbanpercent |
0.123*** |
|
(0.024) |
| leg_black × treat_out |
0.229*** |
|
(0.048) |
| Num.Obs. |
4829 |
| R2 |
0.192 |
| R2 Adj. |
0.191 |
| AIC |
15973.7 |
| BIC |
16045.0 |
| Log.Lik. |
-7975.827 |
| F |
127.431 |
| RMSE |
0.53 |
plot(m1)




library(boot); library(tidyverse)
br <- causaldata::black_politicians
# Function to do IPW estimation with regression adjustment
ipwra <- function(br, index = 1:nrow(br)) {
# Apply bootstrap index
br <- br %>% slice(index)
# estimate and predict propensity score
m <- glm(leg_black ~ medianhhincom + blackpercent + leg_democrat,
data = br, family = binomial(link = 'logit'))
br <- br %>%
mutate(ps = predict(m, type = 'response'))
# Trim control observations outside of treated PS range
minps <- br %>%
filter(leg_black == 1) %>%
pull(ps) %>%
min(na.rm = TRUE)
maxps <- br %>%
filter(leg_black == 1) %>%
pull(ps) %>%
max(na.rm = TRUE)
br <- br %>%
filter(ps >= minps & ps <= maxps)
# Create IPW weights
br <- br %>%
mutate(ipw = case_when(
leg_black == 1 ~ 1/ps,
leg_black == 0 ~ 1/(1-ps)))
# Estimate difference
w_means <- br %>%
group_by(leg_black) %>%
summarize(m = weighted.mean(responded, w = ipw)) %>%
arrange(leg_black)
return(w_means$m[2] - w_means$m[1])
}
b <- boot(br, ipwra, R = 200)
# See estimate and standard error
b
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = br, statistic = ipwra, R = 200)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.1563906 0.007494783 0.1389994
install.packages("Matching")
## Warning: package 'Matching' is in use and will not be installed
library(Matching); library(tidyverse)
br <- causaldata::black_politicians
# Outcome
Y <- br %>%
pull(responded)
# Treatment
D <- br %>%
pull(leg_black)
# Matching variables
# Note select() is also in the Matching package, so we specify dplyr
X <- br %>%
dplyr::select(medianhhincom, blackpercent, leg_democrat) %>%
as.matrix()
# Weight = 2, oddly, denotes Mahalanobis distance
M <- Match(Y, D, X, Weight = 2, caliper = 1)
# See treatment effect estimate
summary(M)
##
## Estimate... -0.0073462
## AI SE...... 0.072683
## T-stat..... -0.10107
## p.val...... 0.91949
##
## Original number of observations.............. 5593
## Original number of treated obs............... 364
## Matched number of observations............... 363
## Matched number of observations (unweighted). 405
##
## Caliper (SDs)........................................ 1 1 1
## Number of obs dropped by 'exact' or 'caliper' 1
# Get matched data for use elsewhere. Note that this approach will
# duplicate each observation for each time it was matched
matched_treated <- tibble(id = M$index.treated,
weight = M$weights)
matched_control <- tibble(id = M$index.control,
weight = M$weights)
matched_sets <- bind_rows(matched_treated,
matched_control)
# Simplify to one row per observation
matched_sets <- matched_sets %>%
group_by(id) %>%
summarize(weight = sum(weight))
# And bring back to data
matched_br <- br %>%
mutate(id = row_number()) %>%
left_join(matched_sets, by = 'id')
# To be used like this! The standard errors here are wrong
p<-lm(responded~leg_black, data = matched_br, weights = weight)
p
##
## Call:
## lm(formula = responded ~ leg_black, data = matched_br, weights = weight)
##
## Coefficients:
## (Intercept) leg_black
## 0.398531 -0.007346
plot(p)



