Longitudinal Targeted Maximum Likelihood Estimation

LMTLE: A Brief Introduction

This instruction (of causal inference) is for the complex longitudinal data.

In detail please see the reference by Ashley I Naimi.

Constructing stabilized IP weights

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# data management or manipulate to generate lag variables
a <- read_csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\2023_04_21-time-dependent.csv") %>% 
group_by(ID) %>%
  
# lag after group id
mutate(exposure_lag = lag(exposure, n = 1L,
default = 0), c1_lag = lag(c1, n = 1L,
default = 0), c2_lag = lag(c2, n = 1L,
default = 0)) %>%
ungroup()

## Rows: 3435 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): ID, Int, exposure, c1, c2, Y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 3,435 × 9
##       ID   Int exposure    c1    c2     Y exposure_lag c1_lag c2_lag
##    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>        <dbl>  <dbl>  <dbl>
##  1     1     1        0     0     1     0            0      0      0
##  2     1     2        1     1     1     0            0      0      1
##  3     1     3        1     1     0     0            1      1      1
##  4     1     4        1     1     1     1            1      1      0
##  5     2     1        0     0     1     0            0      0      0
##  6     2     2        1     0     1     0            0      0      1
##  7     2     3        0     1     1     0            1      0      1
##  8     2     4        1     0     1     0            0      1      1
##  9     3     1        1     1     1     0            0      0      0
## 10     3     2        1     1     1     0            1      1      1
## # ℹ 3,425 more rows

calculate stabilized IP weights by ID

# numerator
num <- glm(exposure ~ factor(Int), data = a,
family = binomial("logit"))$fitted.values
# denominator
den <- glm(exposure ~ factor(Int) + exposure_lag +
c1 + c2 + c1_lag + c2_lag, data = a,
# using lag?
  
family = binomial("logit"))$fitted.values

# ip
a <- a %>%
mutate(sw_ = (num/den) * exposure + ((1 -
num)/(1 - den)) * (1 - exposure)) %>%
  
# cumprod ip  
group_by(ID) %>%
mutate(sw = cumprod(sw_)) %>%
  
ungroup() %>%
select(-sw_)

# summary weights
a %>%
group_by(Int) %>%
# group by time (close to 1) 
summarise(meanSW = mean(sw), maxSW = max(sw))

## # A tibble: 4 × 3
##     Int meanSW maxSW
##   <dbl>  <dbl> <dbl>
## 1     1  0.997  1.60
## 2     2  0.994  2.65
## 3     3  0.988  3.87
## 4     4  0.993  5.79

a %>%
group_by(ID) %>%
# group by ID
summarise(meanSW = mean(sw), maxSW = max(sw))

## # A tibble: 1,228 × 3
##       ID meanSW maxSW
##    <dbl>  <dbl> <dbl>
##  1     1  0.919 1.02 
##  2     2  1.66  2.57 
##  3     3  0.743 0.928
##  4     4  1.01  1.13 
##  5     5  0.877 0.928
##  6     6  0.857 0.928
##  7     7  1.30  1.44 
##  8     8  1.18  1.46 
##  9     9  0.844 1.02 
## 10    10  0.928 0.928
## # ℹ 1,218 more rows

use these weights to fit an IP weighted MSM

# aim to adjust the weight
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(sandwich)

modMSM <- lm(Y ~ factor(Int) + I(exposure/4),  #why divided by 4, otherwise, coefficient *4
data = a, weights = sw)

coeftest(modMSM, vcov. = vcovCL(modMSM, cluster = a$ID,
type = "HC3"))

## 
## t test of coefficients:
## 
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.1920508  0.0160912 11.9351  < 2e-16 ***
## factor(Int)2   0.0198237  0.0192788  1.0283  0.30390    
## factor(Int)3   0.0303484  0.0222573  1.3635  0.17281    
## factor(Int)4  -0.0092636  0.0241587 -0.3834  0.70141    
## I(exposure/4)  0.1840595  0.0721127  2.5524  0.01074 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Longitudinal Targeted Maximum Likelihood Estimation

library(ltmle)

# read in data
a <- read_csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\2023_04_21-time-dependent.csv")

## Rows: 3435 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): ID, Int, exposure, c1, c2, Y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

a %>%
print(n = 3)

## # A tibble: 3,435 × 6
##      ID   Int exposure    c1    c2     Y
##   <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>
## 1     1     1        0     0     1     0
## 2     1     2        1     1     1     0
## 3     1     3        1     1     0     0
## # ℹ 3,432 more rows

# convert data from long to wide,data transform, without lag
# keep confounders
b <- a %>%
pivot_wider(names_from = Int, values_from = c(exposure,
c1, c2, Y)) %>%
mutate(Y_1 = if_else(is.na(Y_1), 1, Y_1),
Y_2 = if_else(is.na(Y_2), 1, Y_2),
Y_3 = if_else(is.na(Y_3), 1, Y_3),
Y_4 = if_else(is.na(Y_4), 1, Y_4)) %>%
select(exposure_1, c1_1, c2_1, Y_1, exposure_2,
c1_2, c2_2, Y_2, exposure_3, c1_3,
c2_3, Y_3, exposure_4, c1_4, c2_4,
Y_4)

head(b)

## # A tibble: 6 × 16
##   exposure_1  c1_1  c2_1   Y_1 exposure_2  c1_2  c2_2   Y_2 exposure_3  c1_3
##        <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl> <dbl> <dbl>      <dbl> <dbl>
## 1          0     0     1     0          1     1     1     0          1     1
## 2          0     0     1     0          1     0     1     0          0     1
## 3          1     1     1     0          1     1     1     0          1     1
## 4          0     1     1     0          0     1     1     0          1     1
## 5          1     1     1     0          1     0     1     1         NA    NA
## 6          1     1     1     0          1     1     1     1         NA    NA
## # ℹ 6 more variables: c2_3 <dbl>, Y_3 <dbl>, exposure_4 <dbl>, c1_4 <dbl>,
## #   c2_4 <dbl>, Y_4 <dbl>

# ltmle
# super learner library, algorithm
sl.lib <- c("SL.mean",
"SL.glm",
"SL.ranger")

#ltmle
result <- ltmle(b,
Anodes=c(paste0("exposure_",1:4)),   #treatment
Lnodes=c("c1_1", "c2_1", "c1_2", "c2_2",  #
"c1_3", "c2_3", "c1_4", "c2_4"),
Ynodes=c("Y_1","Y_2","Y_3","Y_4"),

survivalOutcome = TRUE,
SL.library = list(Q = sl.lib, g = sl.lib), #g computation and ps?

abar=list(treament = c(1, 1, 1, 1),
control = c(0, 0, 0, 0)),
# estimate.time = T,
# need to comment out to run
stratify = T)

## Loading required namespace: SuperLearner

## Qform not specified, using defaults:

## formula for c1_1:

## Q.kplus1 ~ 1

## formula for c1_2:

## Q.kplus1 ~ c1_1 + c2_1

## formula for c1_3:

## Q.kplus1 ~ c1_1 + c2_1 + c1_2 + c2_2

## formula for c1_4:

## Q.kplus1 ~ c1_1 + c2_1 + c1_2 + c2_2 + c1_3 + c2_3

##

## gform not specified, using defaults:

## formula for exposure_1:

## exposure_1 ~ 1

## formula for exposure_2:

## exposure_2 ~ c1_1 + c2_1

## formula for exposure_3:

## exposure_3 ~ c1_1 + c2_1 + c1_2 + c2_2

## formula for exposure_4:

## exposure_4 ~ c1_1 + c2_1 + c1_2 + c2_2 + c1_3 + c2_3

##

## Loading required package: nnls

## Loading required namespace: ranger

## Error in pred[, "1"] : subscript out of bounds

## Timing estimate unavailable

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# summary(result)
summary(result)[[2]]$ATE

## $long.name
## [1] "Additive Treatment Effect"
## 
## $estimate
## [1] 0.2276607
## 
## $std.dev
## [1] 0.05489238
## 
## $pvalue
## [1] 3.362715e-05
## 
## $CI
##           2.5%     97.5%
## [1,] 0.1200736 0.3352477
## 
## $log.std.err
## [1] FALSE

# result$fit$g
# result$fit$Q

Longitudinal Targeted Maximum Likelihood Estimation

2023-05-08

LMTLE: A Brief Introduction

Constructing stabilized IP weights

Longitudinal Targeted Maximum Likelihood Estimation