Reference book: Hernán, Miguel A, and James M Robins. 2020. Causal Inference: What If. Boca Raton: Chapman & Hall/CRC.

Inverse probability of treatment weight

Data preparation

  • Load data
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## here() starts at C:/Users/hed2/Desktop
  • Data transform
nhefs$cens <- ifelse(is.na(nhefs$wt82), 1, 0)   

nhefs.nmv <- nhefs[which(!is.na(nhefs$wt82)),] #   pro visional ignore subjects with missing values for weight in 1982 
  • Data exploration
lm(wt82_71 ~ qsmk, data=nhefs.nmv) # qsmk  
## 
## Call:
## lm(formula = wt82_71 ~ qsmk, data = nhefs.nmv)
## 
## Coefficients:
## (Intercept)         qsmk  
##       1.984        2.541
  • Averages in different groups
predict(lm(wt82_71 ~ qsmk, data=nhefs.nmv), data.frame(qsmk=1)) # Smoking cessation
##        1 
## 4.525079
predict(lm(wt82_71 ~ qsmk, data=nhefs.nmv), data.frame(qsmk=0)) # No smoking cessation
##        1 
## 1.984498
  • Problem: age is different (unbalance between two groups)
# Table
summary(nhefs.nmv[which(nhefs.nmv$qsmk==0),]$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   33.00   42.00   42.79   51.00   72.00
summary(nhefs.nmv[which(nhefs.nmv$qsmk==1),]$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   35.00   46.00   46.17   56.00   74.00

Inverse probability of treatment weight

(Weighting the exposure)

# Estimation of ip weights via a logistic model
fit <- glm(qsmk ~ sex + race + age + I(age^2) + 
  as.factor(education) + smokeintensity + 
  I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
  as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
           family = binomial(), data = nhefs.nmv)
summary(fit)
## 
## Call:
## glm(formula = qsmk ~ sex + race + age + I(age^2) + as.factor(education) + 
##     smokeintensity + I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
##     as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
##     family = binomial(), data = nhefs.nmv)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5127  -0.7907  -0.6387   0.9832   2.3729  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.2425191  1.3808360  -1.624 0.104369    
## sex                   -0.5274782  0.1540496  -3.424 0.000617 ***
## race                  -0.8392636  0.2100665  -3.995 6.46e-05 ***
## age                    0.1212052  0.0512663   2.364 0.018068 *  
## I(age^2)              -0.0008246  0.0005361  -1.538 0.124039    
## as.factor(education)2 -0.0287755  0.1983506  -0.145 0.884653    
## as.factor(education)3  0.0864318  0.1780850   0.485 0.627435    
## as.factor(education)4  0.0636010  0.2732108   0.233 0.815924    
## as.factor(education)5  0.4759606  0.2262237   2.104 0.035384 *  
## smokeintensity        -0.0772704  0.0152499  -5.067 4.04e-07 ***
## I(smokeintensity^2)    0.0010451  0.0002866   3.647 0.000265 ***
## smokeyrs              -0.0735966  0.0277775  -2.650 0.008061 ** 
## I(smokeyrs^2)          0.0008441  0.0004632   1.822 0.068398 .  
## as.factor(exercise)1   0.3548405  0.1801351   1.970 0.048855 *  
## as.factor(exercise)2   0.3957040  0.1872400   2.113 0.034571 *  
## as.factor(active)1     0.0319445  0.1329372   0.240 0.810100    
## as.factor(active)2     0.1767840  0.2149720   0.822 0.410873    
## wt71                  -0.0152357  0.0263161  -0.579 0.562625    
## I(wt71^2)              0.0001352  0.0001632   0.829 0.407370    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1786.1  on 1565  degrees of freedom
## Residual deviance: 1676.9  on 1547  degrees of freedom
## AIC: 1714.9
## 
## Number of Fisher Scoring iterations: 4
p.qsmk.obs <- ifelse(nhefs.nmv$qsmk == 0, 1 - predict(fit, type = "response"),
                     predict(fit, type = "response"))

nhefs.nmv$w <- 1/p.qsmk.obs

summary(nhefs.nmv$w)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.054   1.230   1.373   1.996   1.990  16.700
library("geepack")
# what is id
msm.w <- geeglm(wt82_71 ~ qsmk, data=nhefs.nmv, weights=w,id = seqn,corstr="independence")
summary(msm.w)
## 
## Call:
## geeglm(formula = wt82_71 ~ qsmk, data = nhefs.nmv, weights = w, 
##     id = seqn, corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err  Wald Pr(>|W|)    
## (Intercept)   1.7800  0.2247 62.73 2.33e-15 ***
## qsmk          3.4405  0.5255 42.87 5.86e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    65.06   4.221
## Number of clusters:   1566  Maximum cluster size: 1
# no association between sex and qsmk in pseudo-population after weighting
xtabs(nhefs.nmv$w ~ nhefs.nmv$sex + nhefs.nmv$qsmk)
##              nhefs.nmv$qsmk
## nhefs.nmv$sex     0     1
##             0 763.6 763.6
##             1 801.7 797.2
# "check" for positivity (White women)
table(nhefs.nmv$age[nhefs.nmv$race == 0 & nhefs.nmv$sex == 1], 
      nhefs.nmv$qsmk[nhefs.nmv$race == 0 & nhefs.nmv$sex == 1])
##     
##       0  1
##   25 24  3
##   26 14  5
##   27 18  2
##   28 20  5
##   29 15  4
##   30 14  5
##   31 11  5
##   32 14  7
##   33 12  3
##   34 22  5
##   35 16  5
##   36 13  3
##   37 14  1
##   38  6  2
##   39 19  4
##   40 10  4
##   41 13  3
##   42 16  3
##   43 14  3
##   44  9  4
##   45 12  5
##   46 19  4
##   47 19  4
##   48 19  4
##   49 11  3
##   50 18  4
##   51  9  3
##   52 11  3
##   53 11  4
##   54 17  9
##   55  9  4
##   56  8  7
##   57  9  2
##   58  8  4
##   59  5  4
##   60  5  4
##   61  5  2
##   62  6  5
##   63  3  3
##   64  7  1
##   65  3  2
##   66  4  0
##   67  2  0
##   69  6  2
##   70  2  1
##   71  0  1
##   72  2  2
##   74  0  1

The inverse probability weight at this time can be: the treatment group is Pr[A=1]/f(A|L) (Or 1/f (A|L), Or 0.5/f (A|L)), and the non-treatment group is Pr[A=0]/f(A|L). WA =1/ f (A| L) is called non-stable weights, while SWA = f (A)/ f (A| L) is called stable weights.

SWA

  • Estimation of denominator
# estimation of denominator of ip weights (same as above, conditional probability)
denom.fit <- glm(qsmk ~ as.factor(sex) + as.factor(race) + age + I(age^2) + 
  as.factor(education) + smokeintensity + 
  I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
  as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
                 family = binomial(), data = nhefs.nmv)
summary(denom.fit)
## 
## Call:
## glm(formula = qsmk ~ as.factor(sex) + as.factor(race) + age + 
##     I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
##     smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + 
##     wt71 + I(wt71^2), family = binomial(), data = nhefs.nmv)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.513  -0.791  -0.639   0.983   2.373  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.242519   1.380836   -1.62  0.10437    
## as.factor(sex)1       -0.527478   0.154050   -3.42  0.00062 ***
## as.factor(race)1      -0.839264   0.210067   -4.00  6.5e-05 ***
## age                    0.121205   0.051266    2.36  0.01807 *  
## I(age^2)              -0.000825   0.000536   -1.54  0.12404    
## as.factor(education)2 -0.028776   0.198351   -0.15  0.88465    
## as.factor(education)3  0.086432   0.178085    0.49  0.62744    
## as.factor(education)4  0.063601   0.273211    0.23  0.81592    
## as.factor(education)5  0.475961   0.226224    2.10  0.03538 *  
## smokeintensity        -0.077270   0.015250   -5.07  4.0e-07 ***
## I(smokeintensity^2)    0.001045   0.000287    3.65  0.00027 ***
## smokeyrs              -0.073597   0.027777   -2.65  0.00806 ** 
## I(smokeyrs^2)          0.000844   0.000463    1.82  0.06840 .  
## as.factor(exercise)1   0.354841   0.180135    1.97  0.04885 *  
## as.factor(exercise)2   0.395704   0.187240    2.11  0.03457 *  
## as.factor(active)1     0.031944   0.132937    0.24  0.81010    
## as.factor(active)2     0.176784   0.214972    0.82  0.41087    
## wt71                  -0.015236   0.026316   -0.58  0.56262    
## I(wt71^2)              0.000135   0.000163    0.83  0.40737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1786.1  on 1565  degrees of freedom
## Residual deviance: 1676.9  on 1547  degrees of freedom
## AIC: 1715
## 
## Number of Fisher Scoring iterations: 4
pd.qsmk <- predict(denom.fit, type = "response")
  • Estimation of numerator
# Estimation of numerator of ip weights (probability (or mean) of treatment, marginal probability)
numer.fit <- glm(qsmk~1, family = binomial(), data = nhefs.nmv)
summary(numer.fit)
## 
## Call:
## glm(formula = qsmk ~ 1, family = binomial(), data = nhefs.nmv)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.771  -0.771  -0.771   1.648   1.648  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0598     0.0578   -18.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1786.1  on 1565  degrees of freedom
## Residual deviance: 1786.1  on 1565  degrees of freedom
## AIC: 1788
## 
## Number of Fisher Scoring iterations: 4
pn.qsmk <- predict(numer.fit, type = "response")
  • Calculate the IPW
nhefs.nmv$sw <- ifelse(nhefs.nmv$qsmk == 0, ((1-pn.qsmk)/(1-pd.qsmk)),
                    (pn.qsmk/pd.qsmk))

# 0.257 is the probability of a treatment
summary(nhefs.nmv$sw)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.331   0.867   0.950   0.999   1.079   4.298
summary(nhefs.nmv$w)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.05    1.23    1.37    2.00    1.99   16.70
  • Calculate ATE
msm.sw <- geeglm(wt82_71 ~ qsmk, data=nhefs.nmv, weights=sw, id=seqn,
                corstr="independence")
summary(msm.sw)
## 
## Call:
## geeglm(formula = wt82_71 ~ qsmk, data = nhefs.nmv, weights = sw, 
##     id = seqn, corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err Wald Pr(>|W|)    
## (Intercept)    1.780   0.225 62.7  2.3e-15 ***
## qsmk           3.441   0.525 42.9  5.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     60.7    3.71
## Number of clusters:   1566  Maximum cluster size: 1
  • Balance assessing
# no association between sex and qsmk in pseudo-population
xtabs(nhefs.nmv$sw ~ nhefs.nmv$sex + nhefs.nmv$qsmk)
##              nhefs.nmv$qsmk
## nhefs.nmv$sex   0   1
##             0 567 197
##             1 595 205

Treatment is a continuous variable

We assume that the probability density f (A|L) is a normal distribution, u=E(A|L), and the density f(A) in the molecule is also normally distributed. Because effect estimates are very sensitive to model choice for the conditional density f(A|L), we need to be very careful when using inverse probability weighting for continuous variables.

# Analysis restricted to subjects reporting <=25 cig/day at baseline
nhefs.nmv.s <- subset(nhefs.nmv, smokeintensity <=25)

# estimation of denominator of ip weights
den.fit.obj <- lm(smkintensity82_71 ~ as.factor(sex) + 
  as.factor(race) + age + I(age^2) + 
  as.factor(education) + smokeintensity + I(smokeintensity^2) +
  smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + wt71  + 
  I(wt71^2), data = nhefs.nmv.s)

p.den <- predict(den.fit.obj, type = "response")

dens.den <- dnorm(nhefs.nmv.s$smkintensity82_71, p.den, summary(den.fit.obj)$sigma) # ipw denominator
# dnorm(x, mean, sd)

# estimation of numerator of ip weights
num.fit.obj <- lm(smkintensity82_71 ~ 1, data = nhefs.nmv.s) # ipw numerator
p.num <- predict(num.fit.obj, type = "response")
dens.num <- dnorm(nhefs.nmv.s$smkintensity82_71, p.num, summary(num.fit.obj)$sigma)

nhefs.nmv.s$sw.a = dens.num/dens.den
summary(nhefs.nmv.s$sw.a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.19    0.89    0.97    1.00    1.05    5.10
msm.sw.cont <- geeglm(wt82_71 ~ smkintensity82_71 + I(smkintensity82_71*smkintensity82_71), 
                 data=nhefs.nmv.s, weights=sw.a, id=seqn, corstr="independence")
summary(msm.sw.cont)
## 
## Call:
## geeglm(formula = wt82_71 ~ smkintensity82_71 + I(smkintensity82_71 * 
##     smkintensity82_71), data = nhefs.nmv.s, weights = sw.a, id = seqn, 
##     corstr = "independence")
## 
##  Coefficients:
##                                          Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept)                               2.00452  0.29512 46.13  1.1e-11 ***
## smkintensity82_71                        -0.10899  0.03154 11.94  0.00055 ***
## I(smkintensity82_71 * smkintensity82_71)  0.00269  0.00242  1.24  0.26489    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     60.5     4.5
## Number of clusters:   1162  Maximum cluster size: 1

The outcome is a dichotomous variable

table(nhefs.nmv$qsmk, nhefs.nmv$death) # qsmk is treatment # death is outcome
##    
##       0   1
##   0 963 200
##   1 312  91
# First, estimation of stabilized weights sw.a  

# Second, fit logistic model below # nhefs.nmv.s/nhefs.nmv
msm.logistic <- geeglm(death ~ qsmk, data=nhefs.nmv.s, weights=sw.a, # sw /sw.a
                      id=seqn, family=binomial(), corstr="independence")
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(msm.logistic)
## 
## Call:
## geeglm(formula = death ~ qsmk, family = binomial(), data = nhefs.nmv.s, 
##     weights = sw.a, id = seqn, corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err   Wald Pr(>|W|)    
## (Intercept)  -1.5026  0.0981 234.80   <2e-16 ***
## qsmk          0.0369  0.1736   0.05     0.83    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1   0.082
## Number of clusters:   1162  Maximum cluster size: 1

Effect Modification

However, when effect modification is present (exposure and a modifier, qsmk and sex), we can put other covariates in the marginal structure model, even if these covariates are not confounding variables. So our numerator of stable weights can be f(A) or f(A|V).

table(nhefs.nmv$sex)
## 
##   0   1 
## 762 804
# estimation of denominator of ip weights
denom.fit <- glm(qsmk ~ as.factor(sex) + as.factor(race) + age + I(age^2) + 
  as.factor(education) + smokeintensity + 
  I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
  as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
                 family = binomial(), data = nhefs.nmv) # denominator
summary(denom.fit)
## 
## Call:
## glm(formula = qsmk ~ as.factor(sex) + as.factor(race) + age + 
##     I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
##     smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + 
##     wt71 + I(wt71^2), family = binomial(), data = nhefs.nmv)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.513  -0.791  -0.639   0.983   2.373  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.242519   1.380836   -1.62  0.10437    
## as.factor(sex)1       -0.527478   0.154050   -3.42  0.00062 ***
## as.factor(race)1      -0.839264   0.210067   -4.00  6.5e-05 ***
## age                    0.121205   0.051266    2.36  0.01807 *  
## I(age^2)              -0.000825   0.000536   -1.54  0.12404    
## as.factor(education)2 -0.028776   0.198351   -0.15  0.88465    
## as.factor(education)3  0.086432   0.178085    0.49  0.62744    
## as.factor(education)4  0.063601   0.273211    0.23  0.81592    
## as.factor(education)5  0.475961   0.226224    2.10  0.03538 *  
## smokeintensity        -0.077270   0.015250   -5.07  4.0e-07 ***
## I(smokeintensity^2)    0.001045   0.000287    3.65  0.00027 ***
## smokeyrs              -0.073597   0.027777   -2.65  0.00806 ** 
## I(smokeyrs^2)          0.000844   0.000463    1.82  0.06840 .  
## as.factor(exercise)1   0.354841   0.180135    1.97  0.04885 *  
## as.factor(exercise)2   0.395704   0.187240    2.11  0.03457 *  
## as.factor(active)1     0.031944   0.132937    0.24  0.81010    
## as.factor(active)2     0.176784   0.214972    0.82  0.41087    
## wt71                  -0.015236   0.026316   -0.58  0.56262    
## I(wt71^2)              0.000135   0.000163    0.83  0.40737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1786.1  on 1565  degrees of freedom
## Residual deviance: 1676.9  on 1547  degrees of freedom
## AIC: 1715
## 
## Number of Fisher Scoring iterations: 4
pd.qsmk <- predict(denom.fit, type = "response")
# estimation of numerator of ip weights
numer.fit <- glm(qsmk~as.factor(sex), family = binomial(), data = nhefs.nmv) # numerator
summary(numer.fit)
## 
## Call:
## glm(formula = qsmk ~ as.factor(sex), family = binomial(), data = nhefs.nmv)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.825  -0.825  -0.719   1.576   1.720  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.9016     0.0799  -11.28   <2e-16 ***
## as.factor(sex)1  -0.3202     0.1160   -2.76   0.0058 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1786.1  on 1565  degrees of freedom
## Residual deviance: 1778.4  on 1564  degrees of freedom
## AIC: 1782
## 
## Number of Fisher Scoring iterations: 4
pn.qsmk <- predict(numer.fit, type = "response")
nhefs.nmv$sw.a <- ifelse(nhefs.nmv$qsmk == 0, ((1-pn.qsmk)/(1-pd.qsmk)),
                     (pn.qsmk/pd.qsmk))

summary(nhefs.nmv$sw.a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.29    0.88    0.96    1.00    1.08    3.80
# Estimating parameters of a marginal structural mean model
msm.emm <- geeglm(wt82_71~as.factor(qsmk) + as.factor(sex) 
                  + as.factor(qsmk):as.factor(sex), data=nhefs.nmv, 
                  weights=sw.a, id=seqn, corstr="independence")
summary(msm.emm)
## 
## Call:
## geeglm(formula = wt82_71 ~ as.factor(qsmk) + as.factor(sex) + 
##     as.factor(qsmk):as.factor(sex), data = nhefs.nmv, weights = sw.a, 
##     id = seqn, corstr = "independence")
## 
##  Coefficients:
##                                  Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept)                       1.78445  0.30984 33.17  8.5e-09 ***
## as.factor(qsmk)1                  3.52198  0.65707 28.73  8.3e-08 ***
## as.factor(sex)1                  -0.00872  0.44882  0.00     0.98    
## as.factor(qsmk)1:as.factor(sex)1 -0.15948  1.04608  0.02     0.88    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     60.8    3.71
## Number of clusters:   1566  Maximum cluster size: 1

If we want to explore the interaction between two treatment variables A and B, we need to put both A and B into the marginal structure model. When calculating the inverse probability weight, the denominator will include these two joint distribution of variables.

Censoring and Missing Values

A quick view for censoring data

table(nhefs$qsmk, nhefs$cens)
##    
##        0    1
##   0 1163   38
##   1  403   25
summary(nhefs[which(nhefs$cens==0),]$wt71)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    39.6    59.5    69.2    70.8    79.8   151.7
summary(nhefs[which(nhefs$cens==1),]$wt71)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    36.2    63.1    72.1    76.6    87.9   169.2

Calculate the weights for treatment and censor

  • For completed data
# for completed data

# estimation of denominator of ip weights for A
denom.fit <- glm(qsmk ~ as.factor(sex) + as.factor(race) + age + I(age^2) + 
  as.factor(education) + smokeintensity + 
  I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
  as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
                 family = binomial(), data = nhefs)
summary(denom.fit)
## 
## Call:
## glm(formula = qsmk ~ as.factor(sex) + as.factor(race) + age + 
##     I(age^2) + as.factor(education) + smokeintensity + I(smokeintensity^2) + 
##     smokeyrs + I(smokeyrs^2) + as.factor(exercise) + as.factor(active) + 
##     wt71 + I(wt71^2), family = binomial(), data = nhefs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.465  -0.804  -0.646   1.058   2.355  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -1.988902   1.241279   -1.60  0.10909    
## as.factor(sex)1       -0.507522   0.148232   -3.42  0.00062 ***
## as.factor(race)1      -0.850231   0.205872   -4.13  3.6e-05 ***
## age                    0.103013   0.048900    2.11  0.03515 *  
## I(age^2)              -0.000605   0.000507   -1.19  0.23297    
## as.factor(education)2 -0.098320   0.190655   -0.52  0.60607    
## as.factor(education)3  0.015699   0.170714    0.09  0.92673    
## as.factor(education)4 -0.042526   0.264276   -0.16  0.87216    
## as.factor(education)5  0.379663   0.220395    1.72  0.08495 .  
## smokeintensity        -0.065156   0.014759   -4.41  1.0e-05 ***
## I(smokeintensity^2)    0.000846   0.000276    3.07  0.00216 ** 
## smokeyrs              -0.073371   0.026996   -2.72  0.00657 ** 
## I(smokeyrs^2)          0.000838   0.000443    1.89  0.05867 .  
## as.factor(exercise)1   0.291412   0.173554    1.68  0.09314 .  
## as.factor(exercise)2   0.355052   0.179929    1.97  0.04846 *  
## as.factor(active)1     0.010875   0.129832    0.08  0.93324    
## as.factor(active)2     0.068312   0.208727    0.33  0.74346    
## wt71                  -0.012848   0.022283   -0.58  0.56423    
## I(wt71^2)              0.000121   0.000135    0.89  0.37096    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1876.3  on 1628  degrees of freedom
## Residual deviance: 1766.7  on 1610  degrees of freedom
## AIC: 1805
## 
## Number of Fisher Scoring iterations: 4
pd.qsmk <- predict(denom.fit, type = "response")
# estimation of numerator of ip weights for A
numer.fit <- glm(qsmk~1, family = binomial(), data = nhefs)
summary(numer.fit)
## 
## Call:
## glm(formula = qsmk ~ 1, family = binomial(), data = nhefs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.781  -0.781  -0.781   1.635   1.635  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.0318     0.0563   -18.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1876.3  on 1628  degrees of freedom
## Residual deviance: 1876.3  on 1628  degrees of freedom
## AIC: 1878
## 
## Number of Fisher Scoring iterations: 4
pn.qsmk <- predict(numer.fit, type = "response")
  • For censoring data
# for censoring data 
# as.factor(qsmk) likes a effect modification

# estimation of denominator of ip weights for C
denom.cens <- glm(cens ~ as.factor(qsmk) + as.factor(sex) + 
  as.factor(race) + age + I(age^2) + 
  as.factor(education) + smokeintensity + 
  I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + 
  as.factor(exercise) + as.factor(active) + wt71 + I(wt71^2), 
                  family = binomial(), data = nhefs)
summary(denom.cens)
## 
## Call:
## glm(formula = cens ~ as.factor(qsmk) + as.factor(sex) + as.factor(race) + 
##     age + I(age^2) + as.factor(education) + smokeintensity + 
##     I(smokeintensity^2) + smokeyrs + I(smokeyrs^2) + as.factor(exercise) + 
##     as.factor(active) + wt71 + I(wt71^2), family = binomial(), 
##     data = nhefs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.097  -0.287  -0.207  -0.157   2.996  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)   
## (Intercept)            4.014466   2.576106    1.56   0.1192   
## as.factor(qsmk)1       0.516867   0.287716    1.80   0.0724 . 
## as.factor(sex)1        0.057313   0.330278    0.17   0.8622   
## as.factor(race)1      -0.012271   0.452489   -0.03   0.9784   
## age                   -0.269729   0.117465   -2.30   0.0217 * 
## I(age^2)               0.002884   0.001114    2.59   0.0096 **
## as.factor(education)2 -0.440788   0.419399   -1.05   0.2933   
## as.factor(education)3 -0.164688   0.370547   -0.44   0.6567   
## as.factor(education)4  0.138447   0.569797    0.24   0.8080   
## as.factor(education)5 -0.382382   0.560181   -0.68   0.4949   
## smokeintensity         0.015712   0.034732    0.45   0.6510   
## I(smokeintensity^2)   -0.000113   0.000606   -0.19   0.8517   
## smokeyrs               0.078597   0.074958    1.05   0.2944   
## I(smokeyrs^2)         -0.000557   0.001032   -0.54   0.5894   
## as.factor(exercise)1  -0.971471   0.387810   -2.51   0.0122 * 
## as.factor(exercise)2  -0.583989   0.372313   -1.57   0.1168   
## as.factor(active)1    -0.247479   0.325455   -0.76   0.4470   
## as.factor(active)2     0.706583   0.396458    1.78   0.0747 . 
## wt71                  -0.087887   0.040012   -2.20   0.0281 * 
## I(wt71^2)              0.000635   0.000226    2.81   0.0049 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 533.36  on 1628  degrees of freedom
## Residual deviance: 465.36  on 1609  degrees of freedom
## AIC: 505.4
## 
## Number of Fisher Scoring iterations: 7
pd.cens <- 1-predict(denom.cens, type = "response")
# estimation of numerator of ip weights for Censor data
numer.cens <- glm(cens~ as.factor(qsmk), family = binomial(), data = nhefs)
summary(numer.cens)
## 
## Call:
## glm(formula = cens ~ as.factor(qsmk), family = binomial(), data = nhefs)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.347  -0.254  -0.254  -0.254   2.628  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -3.421      0.165  -20.75   <2e-16 ***
## as.factor(qsmk)1    0.641      0.264    2.43    0.015 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 533.36  on 1628  degrees of freedom
## Residual deviance: 527.76  on 1627  degrees of freedom
## AIC: 531.8
## 
## Number of Fisher Scoring iterations: 6
pn.cens <- 1-predict(numer.cens, type = "response")
  • Time treatment_w and censor_w

P(A|L)~P(A|L,C)*P(C|A,L)

# times a_w and censor_w

nhefs$sw.a <- ifelse(nhefs$qsmk == 0, ((1-pn.qsmk)/(1-pd.qsmk)),
                     (pn.qsmk/pd.qsmk))

nhefs$sw.c <- pn.cens/pd.cens   

nhefs$sw <- nhefs$sw.c* nhefs$sw.a  #times

summary(nhefs$sw.a)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.33    0.86    0.95    1.00    1.08    4.21
summary(nhefs$sw)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.35    0.86    0.94    1.01    1.08   12.86
msm.sw <- geeglm(wt82_71~qsmk, data=nhefs, 
                  weights=sw, id=seqn, corstr="independence")
summary(msm.sw)
## 
## Call:
## geeglm(formula = wt82_71 ~ qsmk, data = nhefs, weights = sw, 
##     id = seqn, corstr = "independence")
## 
##  Coefficients:
##             Estimate Std.err Wald Pr(>|W|)    
## (Intercept)    1.662   0.233 51.0  9.3e-13 ***
## qsmk           3.496   0.526 44.2  2.9e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     61.8    3.83
## Number of clusters:   1566  Maximum cluster size: 1
  • CI
beta <- coef(msm.sw)
SE <- coef(summary(msm.sw))[,2]
lcl <- beta-qnorm(0.975)*SE 
ucl <- beta+qnorm(0.975)*SE
cbind(beta, lcl, ucl)
##             beta  lcl  ucl
## (Intercept) 1.66 1.21 2.12
## qsmk        3.50 2.47 4.53