Confounder generation with Peter Austin’s simulation method

References

Austin PC. 2009. Some methods of propensity-score matching had superior performance to others: results of an empirical investigation and Monte Carlo simulations. http://www.ncbi.nlm.nih.gov/pubmed/19197955
Li & Greene. 2013. A weighting analogue to pair matching in propensity score analysis.http://www.ncbi.nlm.nih.gov/pubmed/23902694

Load packages

library(magrittr)

Data generation outline in Part 4 of the paper

Treatment is assignmed randomly first, then, the confounders are generated based on treatment. Each confounder is independent from all others. Then the outcome is assigned by the true outcome model, the true causal effect of the treatment is 1 (coefficient of T variable), and there is no effect measure modification by any of the confounders.

set.seed(201502241)
## Create a 1000-people cohort with exposure prevalence 0.1
dat <- data.frame(T = rbinom(n = 1000, size = 1, prob = 0.1))

## Create continuous confounders
CreateContConfounder <- function(d, var) {
    sapply(var, function(T) {rnorm(n = 1, mean = T*d, sd = 1)})
}
dat$C1 <- CreateContConfounder(0.2, dat$T)
dat$C2 <- CreateContConfounder(0.3, dat$T)
dat$C3 <- CreateContConfounder(0.4, dat$T)
dat$C4 <- CreateContConfounder(0.5, dat$T)
dat$C5 <- CreateContConfounder(0.6, dat$T)

## Create binary confounders for unexposed
dat[dat$T == 0,"B1"] <- rbinom(n = sum(dat$T == 0), size = 1, prob = 0.2)
dat[dat$T == 0,"B2"] <- rbinom(n = sum(dat$T == 0), size = 1, prob = 0.3)
dat[dat$T == 0,"B3"] <- rbinom(n = sum(dat$T == 0), size = 1, prob = 0.4)
dat[dat$T == 0,"B4"] <- rbinom(n = sum(dat$T == 0), size = 1, prob = 0.5)
dat[dat$T == 0,"B5"] <- rbinom(n = sum(dat$T == 0), size = 1, prob = 0.6)

## Create binary confounders for exposed
dat[dat$T == 1,"B1"] <- rbinom(n = sum(dat$T == 1), size = 1, prob = 0.168)
dat[dat$T == 1,"B2"] <- rbinom(n = sum(dat$T == 1), size = 1, prob = 0.331)
dat[dat$T == 1,"B3"] <- rbinom(n = sum(dat$T == 1), size = 1, prob = 0.492)
dat[dat$T == 1,"B4"] <- rbinom(n = sum(dat$T == 1), size = 1, prob = 0.642)
dat[dat$T == 1,"B5"] <- rbinom(n = sum(dat$T == 1), size = 1, prob = 0.776)

## Create outcome
dat$Y <- with(dat, -2.8 + 1.5*C1 + 2*C2 + 3*C3 + 4*C4 + 5*C5 +
                  5*B1 + 4*B2 + 3*B3 + 2*B4 + 1.5*B5 +
                  T + rnorm(n = 1000, mean = 0, sd = 2))

Check data for balance

tableone::CreateTableOne(data = dat, strata = "T", factorVars = paste0("B", 1:5))

##                 Stratified by T
##                  0             1             p      test
##   n               908           92                      
##   T (mean (sd))   0.00 (0.00)   1.00 (0.00)  <0.001     
##   C1 (mean (sd)) -0.02 (1.01)   0.10 (1.04)   0.287     
##   C2 (mean (sd)) -0.03 (1.01)   0.39 (0.92)  <0.001     
##   C3 (mean (sd))  0.02 (0.98)   0.39 (1.05)   0.001     
##   C4 (mean (sd))  0.02 (1.00)   0.43 (1.01)  <0.001     
##   C5 (mean (sd))  0.01 (1.01)   0.39 (0.99)   0.001     
##   B1 = 1 (%)       183 (20.2)     11 (12.0)   0.079     
##   B2 = 1 (%)       255 (28.1)     37 (40.2)   0.020     
##   B3 = 1 (%)       344 (37.9)     42 (45.7)   0.178     
##   B4 = 1 (%)       462 (50.9)     66 (71.7)  <0.001     
##   B5 = 1 (%)       527 (58.0)     69 (75.0)   0.002     
##   Y (mean (sd))   2.41 (8.13)  10.27 (8.08)  <0.001

Regular regression

The link function is linear and there is no effect modification, so the conditional effect from the linear model is equivalent to the marginal causal effect. The model structure is the same as the true outcome generation model.

resLm <- lm(formula = Y ~ T + C1 + C2 + C3 + C4 + C5 + B1 + B2 + B3 + B4 + B5,
            data    = dat)
summary(resLm)

## 
## Call:
## lm(formula = Y ~ T + C1 + C2 + C3 + C4 + C5 + B1 + B2 + B3 + 
##     B4 + B5, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9137 -1.3122  0.0158  1.2372  6.1564 
## 
## Coefficients:
##             Estimate Std. Error t value      Pr(>|t|)    
## (Intercept) -2.96082    0.13756 -21.524       < 2e-16 ***
## T            1.29645    0.22045   5.881 0.00000000558 ***
## C1           1.48270    0.06064  24.450       < 2e-16 ***
## C2           2.07050    0.06084  34.031       < 2e-16 ***
## C3           2.98016    0.06195  48.106       < 2e-16 ***
## C4           3.94885    0.06158  64.122       < 2e-16 ***
## C5           4.83431    0.06065  79.707       < 2e-16 ***
## B1           5.42905    0.15549  34.917       < 2e-16 ***
## B2           3.83676    0.13504  28.412       < 2e-16 ***
## B3           3.01795    0.12598  23.956       < 2e-16 ***
## B4           2.10852    0.12391  17.017       < 2e-16 ***
## B5           1.49469    0.12538  11.921       < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.928 on 988 degrees of freedom
## Multiple R-squared:  0.9483, Adjusted R-squared:  0.9477 
## F-statistic:  1647 on 11 and 988 DF,  p-value: < 2.2e-16

Parametric g-formula

This gives the marginal causal effect in the population based on the outcome model (no misspecification here).

datT1 <- datT0 <- dat
datT1$T <- 1
datT0$T <- 0
mean(predict(resLm, datT1)) - mean(predict(resLm, datT0))

## [1] 1.296446

Inverse probability of treatment weighting

This gives the marginal causal effect in the population based on the treatment model. It is conceptually difficult, as the true model is “treatment generating confounders” rather than “confounders determining treatment”. The result suggests potential PS model mispecification.

resGlm <- glm(formula = T ~ C1 + C2 + C3 + C4 + C5 + B1 + B2 + B3 + B4 + B5,
              data    = dat, family = binomial(link = "logit"))

dat$ps <- predict(resGlm, type = "response")
dat$ipw <- 1 / ((dat$T * dat$ps) + ((1 - dat$T) * (1 - dat$ps)))
summary(dat$ipw)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.001   1.024   1.063   1.926   1.179 109.000

resLmIpw <- lm(formula = Y ~ T, weight = ipw, data = dat)

## Load sandwich package for robust sandwich covariance matrix estimators
library(sandwich)
## Load lmtest package for coeftest
library(lmtest)
## Test using robust sandwich covariance matrix estimators
coeftest(resLmIpw, vcov = sandwich)

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.98844    0.27976 10.6820  < 2e-16 ***
## T            2.50747    1.39647  1.7956  0.07286 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Matching (algorithmic)

Algorithmic PS matching discards unmatched unexposed individuals. There appears to be less influence of PS model misspecification here than IPTW. In the presence of effect modification their estimands will not match anyway.

library(Matching)
listMatch <- Match(Tr       = dat$T,                      # Need to be in 0,1
                   X        = log(dat$ps / (1 - dat$ps)), # logit of PS,i.e., log(PS/(1-PS))
                   M        = 1,                          # 1:1 match
                   caliper  = 0.2,                        # caliper = x * SD(each matching variable)
                   replace  = FALSE,
                   ties     = TRUE,
                   version  = "fast")

## Show number matched
print(summary(listMatch))

## 
## Estimate...  0 
## SE.........  0 
## T-stat.....  NaN 
## p.val......  NA 
## 
## Original number of observations..............  1000 
## Original number of treated obs...............  92 
## Matched number of observations...............  89 
## Matched number of observations  (unweighted).  89 
## 
## Caliper (SDs)........................................   0.2 
## Number of obs dropped by 'exact' or 'caliper'  3

## Extract matched data
psMatchData <- dat[unlist(listMatch[c("index.treated","index.control")]), ]

## PS matched cohort
lm(formula = Y ~ T, data = psMatchData) %>% summary

## 
## Call:
## lm(formula = Y ~ T, data = psMatchData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5061  -5.0613   0.0102   5.6141  24.2808 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.1820     0.7889   10.37   <2e-16 ***
## T             1.5171     1.1157    1.36    0.176    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.442 on 176 degrees of freedom
## Multiple R-squared:  0.0104, Adjusted R-squared:  0.004775 
## F-statistic: 1.849 on 1 and 176 DF,  p-value: 0.1756

Matching weight

This is by the matching weight method (Li & Greene 2013). The estimand is theoretically the same as the algorithmic matching method, and likely more efficient. The effect estimate may be less biased.

## Matchin weight
dat$mwNum <- pmin(dat$ps, (1 - dat$ps))
dat$mw <- dat$mwNum * dat$ipw
summary(dat$mw)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0009748 0.0243500 0.0627800 0.1816000 0.1788000 1.0000000

## Weighted analysis
resLmMw <- lm(formula = Y ~ T, weight = mw, data = dat)

## Test using robust sandwich covariance matrix estimators
coeftest(resLmMw, vcov = sandwich)

## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.70704    0.35485 24.5373   <2e-16 ***
## T            1.22103    0.86860  1.4057   0.1601    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Top Page: http://rpubs.com/kaz_yos/
github: https://github.com/kaz-yos