This instruction (of causal inference) is for the complex longitudinal data.
In detail please see the reference by Ashley I Naimi.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# data management or manipulate to generate lag variables
a <- read_csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\2023_04_21-time-dependent.csv") %>%
group_by(ID) %>%
# lag after group id
mutate(exposure_lag = lag(exposure, n = 1L,
default = 0), c1_lag = lag(c1, n = 1L,
default = 0), c2_lag = lag(c2, n = 1L,
default = 0)) %>%
ungroup()
## Rows: 3435 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): ID, Int, exposure, c1, c2, Y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a
## # A tibble: 3,435 × 9
## ID Int exposure c1 c2 Y exposure_lag c1_lag c2_lag
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 0 1 0 0 0 0
## 2 1 2 1 1 1 0 0 0 1
## 3 1 3 1 1 0 0 1 1 1
## 4 1 4 1 1 1 1 1 1 0
## 5 2 1 0 0 1 0 0 0 0
## 6 2 2 1 0 1 0 0 0 1
## 7 2 3 0 1 1 0 1 0 1
## 8 2 4 1 0 1 0 0 1 1
## 9 3 1 1 1 1 0 0 0 0
## 10 3 2 1 1 1 0 1 1 1
## # ℹ 3,425 more rows
calculate stabilized IP weights by ID
# numerator
num <- glm(exposure ~ factor(Int), data = a,
family = binomial("logit"))$fitted.values
# denominator
den <- glm(exposure ~ factor(Int) + exposure_lag +
c1 + c2 + c1_lag + c2_lag, data = a,
# using lag?
family = binomial("logit"))$fitted.values
# ip
a <- a %>%
mutate(sw_ = (num/den) * exposure + ((1 -
num)/(1 - den)) * (1 - exposure)) %>%
# cumprod ip
group_by(ID) %>%
mutate(sw = cumprod(sw_)) %>%
ungroup() %>%
select(-sw_)
# summary weights
a %>%
group_by(Int) %>%
# group by time (close to 1)
summarise(meanSW = mean(sw), maxSW = max(sw))
## # A tibble: 4 × 3
## Int meanSW maxSW
## <dbl> <dbl> <dbl>
## 1 1 0.997 1.60
## 2 2 0.994 2.65
## 3 3 0.988 3.87
## 4 4 0.993 5.79
a %>%
group_by(ID) %>%
# group by ID
summarise(meanSW = mean(sw), maxSW = max(sw))
## # A tibble: 1,228 × 3
## ID meanSW maxSW
## <dbl> <dbl> <dbl>
## 1 1 0.919 1.02
## 2 2 1.66 2.57
## 3 3 0.743 0.928
## 4 4 1.01 1.13
## 5 5 0.877 0.928
## 6 6 0.857 0.928
## 7 7 1.30 1.44
## 8 8 1.18 1.46
## 9 9 0.844 1.02
## 10 10 0.928 0.928
## # ℹ 1,218 more rows
use these weights to fit an IP weighted MSM
# aim to adjust the weight
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
modMSM <- lm(Y ~ factor(Int) + I(exposure/4), #why divided by 4, otherwise, coefficient *4
data = a, weights = sw)
coeftest(modMSM, vcov. = vcovCL(modMSM, cluster = a$ID,
type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1920508 0.0160912 11.9351 < 2e-16 ***
## factor(Int)2 0.0198237 0.0192788 1.0283 0.30390
## factor(Int)3 0.0303484 0.0222573 1.3635 0.17281
## factor(Int)4 -0.0092636 0.0241587 -0.3834 0.70141
## I(exposure/4) 0.1840595 0.0721127 2.5524 0.01074 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(ltmle)
# read in data
a <- read_csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\2023_04_21-time-dependent.csv")
## Rows: 3435 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): ID, Int, exposure, c1, c2, Y
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a %>%
print(n = 3)
## # A tibble: 3,435 × 6
## ID Int exposure c1 c2 Y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0 0 1 0
## 2 1 2 1 1 1 0
## 3 1 3 1 1 0 0
## # ℹ 3,432 more rows
# convert data from long to wide,data transform, without lag
# keep confounders
b <- a %>%
pivot_wider(names_from = Int, values_from = c(exposure,
c1, c2, Y)) %>%
mutate(Y_1 = if_else(is.na(Y_1), 1, Y_1),
Y_2 = if_else(is.na(Y_2), 1, Y_2),
Y_3 = if_else(is.na(Y_3), 1, Y_3),
Y_4 = if_else(is.na(Y_4), 1, Y_4)) %>%
select(exposure_1, c1_1, c2_1, Y_1, exposure_2,
c1_2, c2_2, Y_2, exposure_3, c1_3,
c2_3, Y_3, exposure_4, c1_4, c2_4,
Y_4)
head(b)
## # A tibble: 6 × 16
## exposure_1 c1_1 c2_1 Y_1 exposure_2 c1_2 c2_2 Y_2 exposure_3 c1_3
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0 1 0 1 1 1 0 1 1
## 2 0 0 1 0 1 0 1 0 0 1
## 3 1 1 1 0 1 1 1 0 1 1
## 4 0 1 1 0 0 1 1 0 1 1
## 5 1 1 1 0 1 0 1 1 NA NA
## 6 1 1 1 0 1 1 1 1 NA NA
## # ℹ 6 more variables: c2_3 <dbl>, Y_3 <dbl>, exposure_4 <dbl>, c1_4 <dbl>,
## # c2_4 <dbl>, Y_4 <dbl>
# ltmle
# super learner library, algorithm
sl.lib <- c("SL.mean",
"SL.glm",
"SL.ranger")
#ltmle
result <- ltmle(b,
Anodes=c(paste0("exposure_",1:4)), #treatment
Lnodes=c("c1_1", "c2_1", "c1_2", "c2_2", #
"c1_3", "c2_3", "c1_4", "c2_4"),
Ynodes=c("Y_1","Y_2","Y_3","Y_4"),
survivalOutcome = TRUE,
SL.library = list(Q = sl.lib, g = sl.lib), #g computation and ps?
abar=list(treament = c(1, 1, 1, 1),
control = c(0, 0, 0, 0)),
# estimate.time = T,
# need to comment out to run
stratify = T)
## Loading required namespace: SuperLearner
## Qform not specified, using defaults:
## formula for c1_1:
## Q.kplus1 ~ 1
## formula for c1_2:
## Q.kplus1 ~ c1_1 + c2_1
## formula for c1_3:
## Q.kplus1 ~ c1_1 + c2_1 + c1_2 + c2_2
## formula for c1_4:
## Q.kplus1 ~ c1_1 + c2_1 + c1_2 + c2_2 + c1_3 + c2_3
##
## gform not specified, using defaults:
## formula for exposure_1:
## exposure_1 ~ 1
## formula for exposure_2:
## exposure_2 ~ c1_1 + c2_1
## formula for exposure_3:
## exposure_3 ~ c1_1 + c2_1 + c1_2 + c2_2
## formula for exposure_4:
## exposure_4 ~ c1_1 + c2_1 + c1_2 + c2_2 + c1_3 + c2_3
##
## Loading required package: nnls
## Loading required namespace: ranger
## Error in pred[, "1"] : subscript out of bounds
## Timing estimate unavailable
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# summary(result)
summary(result)[[2]]$ATE
## $long.name
## [1] "Additive Treatment Effect"
##
## $estimate
## [1] 0.2276607
##
## $std.dev
## [1] 0.05489238
##
## $pvalue
## [1] 3.362715e-05
##
## $CI
## 2.5% 97.5%
## [1,] 0.1200736 0.3352477
##
## $log.std.err
## [1] FALSE
# result$fit$g
# result$fit$Q