Purpose: Simulate a dataset with a time-varying exposure and apply a marginal structural model.

For simplicity, the model is non-parametric (i.e. doesn’t use regression).

#load the packages.
library(tidyverse) #see http://r4ds.had.co.nz/ for background on this set of packages.
library(simcausal) #for simulating a dataset based on a DAG

Goal: assess effect of being physically active at both time points {1, 1} compared with not being physically active at either time point {0, 0}.

Assume all variables are dichotomous. Physical activity (A1) = 1 denotes being active, 0 denotes inactive, year 1.
Diet (D2) = 1 denotes healthy diet; 0 denotes unhealthy diet, year 2.
Physical activity (A3) = 1 denotes being active, 0 denotes inactive, year 3.
Mortality (M) = 1 denotes death, year 4; 0 else.

D <- DAG.empty()
D <- D +
  node("A1", distr = "rbern", prob = 0.3) + #activity, first year
  node("D2", distr = "rbern", prob = plogis(-2 + 2*A1)) + #diet, second year
  node("A3", distr = "rbern", prob = plogis(-2 + 2*(A1) + 2*D2))+ #activity, third year

    #death (M for mortality, year 4)
  node("M", distr = "rbern", prob = plogis(-2 + -.5*(A1) + -2*(A3) + -1*D2)) 
              

#set the DAG
D <- set.DAG(D)

Plot the DAG
A1: Physical activity at year 1
D2: diet at year 2
A3: Physical activity at year 3
M: mortality at yearr 4

plotDAG(D, xjitter = 0.3, yjitter = 0.2, edge_attrs = 
          list(width = 0.5, arrow.width = 0.4, arrow.size = 0.5),
        vertex_attrs = list(size = 12, label.cex = 1))

Simualate a dataset based on the DAG Setting the seed makes it so that the same numbers appear each time you run the code.

dat <- sim(D,n=10000, rndseed = 4) #simulate a dataset based on the DAG

#look at marginal distribution of the exposure and outcome.
table(dat$A1) 
## 
##    0    1 
## 7004 2996
table(dat$A3) 
## 
##    0    1 
## 6799 3201
table(dat$D2)
## 
##    0    1 
## 7687 2313
table(dat$M)
## 
##    0    1 
## 9191  809

Take a peak at the simulated dataset.

head(dat)
##   ID A1 D2 A3 M
## 1  1  0  0  0 0
## 2  2  0  0  0 0
## 3  3  0  0  1 0
## 4  4  0  0  0 0
## 5  5  1  0  0 0
## 6  6  0  0  0 0

Look at crude risk difference

crude  = dat  %>% 
  group_by(A1, A3) %>% 
  summarise(m_perc = sum(M) / n())

(RD_crude = filter(crude, A1==1 & A3==1) - filter(crude, A1==0 & A3==0)) 
##   A1 A3     m_perc
## 1  1  1 -0.1121837

Interpret: the observed risk difference comparing doubly exposed vs. doubly unexposed is -0.112, The doubly exposed (active at both time points) have a risk of death 0.11 lower than those doubly unexposed (inactive at both time points).

Calculate probability of exposure at each time point.

#1. Find P(A1) 
time1 = dat %>% 
  summarise(P_A1 = sum(A1)/n()) #exogenous, so marginal is OK

#2. Find P(A3)
time3 = dat %>%
  group_by(A1, D2) %>% #conditional on A1 and D2 because A3 depends on those for its values.
  summarise(P_A3 = sum(A3)/n()) 

time1
##     P_A1
## 1 0.2996
time3
## # A tibble: 4 x 3
## # Groups:   A1 [?]
##      A1    D2  P_A3
##   <int> <int> <dbl>
## 1     0     0 0.119
## 2     0     1 0.507
## 3     1     0 0.484
## 4     1     1 0.884

Link probability of exposure at each time point to the original dataset and calculate inverse-probability-of-treatment weights (IPTW)

#Join these back to the main dataset

dat_a1  = cbind(dat, time1)  #cbind joins two datasets together side by side. not using left_join here because P_A1 is the same for everybody.

dat_A3 = dat_a1 %>%
  left_join(time3, by = c("A1", "D2")) %>% #merge by the variables that cause A3

  #if exposed, assign the probability of exposure. 
  #if unexposed assign 1-the probability of exposure
  mutate(P_A1_received = case_when(
    A1==1 ~ P_A1,
    A1==0 ~ 1-P_A1)) %>%
  mutate(P_A3_received = case_when(
      A3==1 ~ P_A3,
      A3==0 ~ 1-P_A3)) %>%
  
  
  #this is the probability that a subject's regimen is as observed given her value of physical activity at the first time point and her value of diet at the second year.
    mutate(P_A_both = P_A1_received*P_A3_received) %>%
    
  #the weights, for each subject, are 1 over the probability of the regimen she actually followed, given treatment history and covariate history
  mutate(IPTW = 1/P_A_both)

#some checks
table(dat_A3$P_A_both)  #there should be 2^3 = 8 possibilities for IPTW
## 
## 0.0347391420911528 0.0835994177583697   0.14501914893617 
##                173                738                728 
##   0.15458085106383  0.264860857908847  0.345507917174178 
##                776               1319                405 
##  0.354892082825822   0.61680058224163 
##                416               5445
summary(dat_A3$P_A_both) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03474 0.26486 0.61680 0.42886 0.61680 0.61680
table(dat_A3$IPTW)
## 
## 1.62126954609173 2.81775798444845 2.89428968279149 3.77556732200933 
##             5445              416              405             1319 
## 6.46910657508981 6.89564107454628 11.9618057973841 28.7859728192503 
##              776              728              738              173
head(dat_A3)
##   ID A1 D2 A3 M   P_A1      P_A3 P_A1_received P_A3_received   P_A_both
## 1  1  0  0  0 0 0.2996 0.1193595        0.7004     0.8806405 0.61680058
## 2  2  0  0  0 0 0.2996 0.1193595        0.7004     0.8806405 0.61680058
## 3  3  0  0  1 0 0.2996 0.1193595        0.7004     0.1193595 0.08359942
## 4  4  0  0  0 0 0.2996 0.1193595        0.7004     0.8806405 0.61680058
## 5  5  1  0  0 0 0.2996 0.4840426        0.2996     0.5159574 0.15458085
## 6  6  0  0  0 0 0.2996 0.1193595        0.7004     0.8806405 0.61680058
##        IPTW
## 1  1.621270
## 2  1.621270
## 3 11.961806
## 4  1.621270
## 5  6.469107
## 6  1.621270

Calculate the risk difference weighted by IPTW

#---Calculate the risk difference weighted by IPTW------####
IPTW_num= dat_A3 %>%  #numerator of the risk calculation
  filter(M==1) %>% #for numerator of risk calculation, limit to outcome=1.
  group_by(A1, A3) %>% #stratify by exposure status
  summarise(num = sum(IPTW*M)) #multiply the weights by the outcome and take sum

IPTW_num #print it
## # A tibble: 4 x 3
## # Groups:   A1 [?]
##      A1    A3    num
##   <int> <int>  <dbl>
## 1     0     0 1158  
## 2     0     1  143  
## 3     1     0  559  
## 4     1     1   88.5
IPTW_denom = dat_A3 %>%  #denominator of the risk calculation
  group_by(A1, A3) %>% 
  summarise(denom = sum(IPTW)) #sum the weights. 

IPTW_denom
## # A tibble: 4 x 3
## # Groups:   A1 [?]
##      A1    A3 denom
##   <int> <int> <dbl>
## 1     0     0 10000
## 2     0     1 10000
## 3     1     0 10000
## 4     1     1 10000
#link the numerator and denominator together and divide to get the weighted risks.
IPTW = IPTW_denom %>%
  left_join(IPTW_num, by = c("A1", "A3")) %>%
  mutate(R_IPTW = num/denom)

IPTW #print it
## # A tibble: 4 x 5
## # Groups:   A1 [2]
##      A1    A3 denom    num  R_IPTW
##   <int> <int> <dbl>  <dbl>   <dbl>
## 1     0     0 10000 1158   0.116  
## 2     0     1 10000  143   0.0143 
## 3     1     0 10000  559   0.0559 
## 4     1     1 10000   88.5 0.00885
(RD_IPTW = filter(IPTW, A1==1 & A3==1) -  filter(IPTW, A1==0 & A3==0))
##   A1 A3 denom      num    R_IPTW
## 1  1  1     0 -1070.01 -0.107001

The risk difference in the weighted dataset is -0.107001. An estimate of the difference in mortalty risk if physical activity were set to active at both time points in a target population compared with the mortality risk if physical activity were set to inactive at both time points in the same population, assuming exchangeability, consistency, and positivity.

Considerations
This does not apply stabilized weights.