Load packages, create additional directories and define additional functions

library(haven)

## Warning: package 'haven' was built under R version 4.5.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'stringr' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(data.table)

## Warning: package 'data.table' was built under R version 4.5.2

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, isoyear, mday, minute, month, quarter, second, wday,
##     week, yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

library(readstata13)
setDTthreads(0)
getDTthreads(verbose = TRUE)

##   OpenMP version (_OPENMP)       201511
##   omp_get_num_procs()            8
##   R_DATATABLE_NUM_PROCS_PERCENT  unset (default 50)
##   R_DATATABLE_NUM_THREADS        unset
##   R_DATATABLE_THROTTLE           unset (default 1024)
##   omp_get_thread_limit()         2147483647
##   omp_get_max_threads()          8
##   OMP_THREAD_LIMIT               unset
##   OMP_NUM_THREADS                unset
##   RestoreAfterFork               true
##   data.table is using 8 threads with throttle==1024. See ?setDTthreads.

## [1] 8

library(SEQTaRget)
#remotes::install_github("CausalInference/SEQTaRget", subdir = "SEQTaRget")

Data preparation

setwd("C:/Users/agn21/OneDrive/Desktop/Msc Epidemiology/6. Causal Inference/Week 20/R")

dat <- read.dta13("msm_hiv_for_practical.dta")

## Warning in read.dta13("msm_hiv_for_practical.dta"): 
##    Factor codes of type double or float detected in variables
## 
##    age_grp, baseline_cd4_grp, baseline_rna_grp,
##    group, cd4_grp, rna_grp, lag_cd4_grp,
##    lag_rna_grp
## 
##    No labels have been assigned.
##    Set option 'nonint.factors = TRUE' to assign labels anyway.

head(dat)

##   id month aidsordeath haart age_grp baseline_cd4_grp baseline_rna_grp
## 1  1     1           0     0       2                5                2
## 2  1     2           0     0       2                5                2
## 3  1     3           0     0       2                5                2
## 4  1     4           0     0       2                5                2
## 5  1     5           0     0       2                5                2
## 6  1     6           0     0       2                5                2
##   baseline_bevent group cd4_grp rna_grp bevent lag_cd4_grp lag_rna_grp year
## 1               0     2       5       2      0           5           2 1999
## 2               0     2       5       2      0           5           2 1999
## 3               0     2       5       2      0           5           2 1999
## 4               0     2       5       2      0           5           2 1999
## 5               0     2       5       2      0           5           2 1999
## 6               0     2       5       2      0           5           2 1999
##   laghaart    timespl1    timespl2    timespl3
## 1        0    0.000000   0.0000000   0.0000000
## 2        0   -0.967033  -0.7362638  -0.2967033
## 3        0   -7.736264  -5.8901100  -2.3736265
## 4        0  -26.109890 -19.8791218  -8.0109892
## 5        0  -60.890110 -47.1208801 -18.9890118
## 6        0 -112.879120 -92.0329666 -37.0879135

dat <- dat |> arrange(id, month)

create dataset with formatted variables

dat <- dat |>
  transmute(
    id, 
    month_start = month - 1, # edited to start at month 0 as needed for SEQTaRget package
    month_end = month, 
    aidsordeath_int = as.integer(aidsordeath),
    haart_int = as.integer(haart),
    laghaart_int = as.integer(laghaart),
    age_grp_fct = as_factor(age_grp),
    group_fct = as_factor(group),
    baseline_cd4_grp_fct = as_factor(baseline_cd4_grp),
    baseline_rna_grp_fct = as_factor(baseline_rna_grp),
    baseline_bevent_int = as.integer(baseline_bevent),
    cd4_grp_fct = as_factor(cd4_grp),
    rna_grp_fct = as_factor(rna_grp),
    lag_cd4_grp_fct = as_factor(lag_cd4_grp),
    lag_rna_grp_fct = as_factor(lag_rna_grp),
    bevent_int = as.integer(bevent),
    year_fct = as_factor(year),
    # spline variables
    timespl1, timespl2, timespl3,
    lag_aidsordeath_int = as.integer(lag(aidsordeath_int)),
    # eligibility variable
    eligible = case_when(
      haart_int == 0 & lag_aidsordeath_int == 0 ~ 1,
      haart_int == 1 & laghaart_int == 0 & lag_aidsordeath_int == 0 ~ 1,
      TRUE ~ 0
    )
  )

Open the dataset of HIV infected individuals, with follow-up from 1996 to 2003..

The dataset has the following variables:

id - person id

month_start - start of the month of observation (since baseline - month 0)

month_end - end of the month of observation

aidsordeath_int - whether the person had developed AIDS or died by the end of the next month

haart_int - whether the person had initiated ART by the end of the current month

laghaart_int - whether the person had initiated ART by the end of the previous month

age_group_fct - baseline age group

group_fct - baseline transmission risk group

baseline_cd4_grp_fct - baseline CD4 group

baseline_rna_grp_fct - baseline plasma HIV-1 RNA group

baseline_bevent_int - CDC stage B event in the month before baseline

cd4_grp_fct - CD4 group measured at start of current month

rna_grp_fct - plasma HIV-1 RNA group measured at start of current month

lag_cd4_grp_fct - lagged (three months previous) CD4 group

lag_rna_grp_fct - lagged (three months previous) plasma HIV-1 RNA group

bevent_int - CDC stage B event during previous month

year_fct - current year

eligible - patients eligibility each month

# Explore the dataset
# Look at the data for an example patient

example_id <- 91
dat |>
  filter(id == example_id) |>
  select(c(id, month_start, month_end, aidsordeath_int, haart_int, rna_grp_fct))

##   id month_start month_end aidsordeath_int haart_int rna_grp_fct
## 1 91           0         1               0         0           3
## 2 91           1         2               0         0           3
## 3 91           2         3               0         0           3
## 4 91           3         4               0         0           3
## 5 91           4         5               0         0           4
## 6 91           5         6               0         1           4
## 7 91           6         7               0         1           4
## 8 91           7         8               1         1           4

Are they ever treated with ART? If yes, between month 5 - 6

Do they develop AIDS or die during follow-up? Died in month 8 - 9

Do any of the clinical variables update over time? yes, plasma

In the following exercises we will estimate the per-protocol effect of ART initiation on the outcome AIDS or death using a sequential trials approach.

# Create the sequential trials and estimate a hazard ratio for the per protocol effect of ART using pooled logistic regression, accounting for the informative censoring

# define the options used in the SEQuential function
opts_haz_ipcw <- SEQopts(
  hazard = TRUE,
  weighted = TRUE,
  weight.preexpansion = TRUE,
  weight.eligible_cols = list("eligible"),   # exclude already-treated rows
  bootstrap = TRUE,
  bootstrap.nboot = 20, 
  seed = 12345, 
  data.return = TRUE,
  selection.random = TRUE,
  selection.prob = 0.5
)

# Estimate the hazard ratio using informative censoring weights
model_haz_ipcw <- SEQuential(
  data = dat,
  id.col = "id",
  time.col = "month_start",
  eligible.col = "eligible",
  treatment.col = "haart_int",
  outcome.col = "aidsordeath_int",
  time_varying.cols =
    c("cd4_grp_fct", "rna_grp_fct", "lag_cd4_grp_fct",
    "lag_rna_grp_fct", "bevent_int", "year_fct"),
  fixed.cols =
    c("baseline_cd4_grp_fct", "baseline_rna_grp_fct",
    "baseline_bevent_int", "age_grp_fct", "group_fct"),
  method = "censoring",
  options = opts_haz_ipcw)

## Non-required columns provided, pruning for efficiency
## Pruned
## Expanding Data...
## Expansion Successful
## Moving forward with censoring analysis
## Bootstrapping with 80 % of data 20 times
## Completed

Explore the example patient in the sequential trials and compare to the unexpanded data

model_haz_ipcw@DT  |> 
  filter(id == example_id) |>
  arrange(id, trial, period) |> 
  select(id, trial, period, followup, haart_int, haart_int_bas, aidsordeath_int, weight) |>
  print()

##        id trial period followup haart_int haart_int_bas aidsordeath_int
##     <int> <num>  <int>    <int>    <fctr>        <fctr>           <int>
##  1:    91     0      0        0         0             0               0
##  2:    91     0      1        1         0             0               0
##  3:    91     0      2        2         0             0               0
##  4:    91     0      3        3         0             0               0
##  5:    91     0      4        4         0             0               0
##  6:    91     0      5        5         1             0              NA
##  7:    91     1      1        0         0             0               0
##  8:    91     1      2        1         0             0               0
##  9:    91     1      3        2         0             0               0
## 10:    91     1      4        3         0             0               0
## 11:    91     1      5        4         1             0              NA
## 12:    91     2      2        0         0             0               0
## 13:    91     2      3        1         0             0               0
## 14:    91     2      4        2         0             0               0
## 15:    91     2      5        3         1             0              NA
## 16:    91     3      3        0         0             0               0
## 17:    91     3      4        1         0             0               0
## 18:    91     3      5        2         1             0              NA
## 19:    91     4      4        0         0             0               0
## 20:    91     4      5        1         1             0              NA
## 21:    91     5      5        0         1             1               0
## 22:    91     5      6        1         1             1               0
## 23:    91     5      7        2         1             1               1
##        id trial period followup haart_int haart_int_bas aidsordeath_int
##     <int> <num>  <int>    <int>    <fctr>        <fctr>           <int>
##        weight
##         <num>
##  1: 1.0000000
##  2: 1.0782659
##  3: 1.1271890
##  4: 1.1758135
##  5: 1.3535174
##  6: 0.4097814
##  7: 1.0782659
##  8: 1.1271890
##  9: 1.1758135
## 10: 1.3535174
## 11: 0.4097814
## 12: 1.0453720
## 13: 1.0904671
## 14: 1.2552724
## 15: 0.3800374
## 16: 1.0431378
## 17: 1.2007901
## 18: 0.3635427
## 19: 1.1511327
## 20: 0.3485088
## 21: 0.3027529
## 22: 0.3027529
## 23: 0.3027529
##        weight
##         <num>

How many trials does the example patient contribute to? 6 How many trials are they in the control group 5 and how many are they in the treated group? 1

When is the patient artificially censored? 8th Month

Lets explore the models that have been fit to the data. Which covariates were used in the outcome model and the censoring models?

# what covariates were used in the censoring models (numerator and denominator) and the outcome model?
covariates(model_haz_ipcw)

## $Outcome
## [1] "aidsordeath_int ~ haart_int_bas + followup + followup_sq + trial + trial_sq + baseline_cd4_grp_fct + baseline_rna_grp_fct + baseline_bevent_int + age_grp_fct + group_fct"
## 
## $Numerator
## [1] "haart_int ~ baseline_cd4_grp_fct + baseline_rna_grp_fct + baseline_bevent_int + age_grp_fct + group_fct + month_start + month_start_sq"
## 
## $Denominator
## [1] "haart_int ~ baseline_cd4_grp_fct + baseline_rna_grp_fct + baseline_bevent_int + age_grp_fct + group_fct + cd4_grp_fct + rna_grp_fct + lag_cd4_grp_fct + lag_rna_grp_fct + bevent_int + year_fct + month_start + month_start_sq"

Create the sequential trials and estimate a hazard ratio for the per protocol effect of ART using pooled logistic regression, but without accounting for the informative censoring

# define the options used in the SEQuential function
opts_haz_unweighted <- SEQopts(
  hazard = TRUE,
  weighted = FALSE,
  bootstrap = TRUE,
  bootstrap.nboot = 20,
  seed = 1234, 
  selection.random = TRUE,
  selection.prob = 0.5
)

# Estimate the hazard ratio using informative censoring weights
model_haz_unweighted <- SEQuential(
  data = dat,
  id.col = "id",
  time.col = "month_start",
  eligible.col = "eligible",
  treatment.col = "haart_int",
  outcome.col = "aidsordeath_int",
  time_varying.cols =
    c("cd4_grp_fct", "rna_grp_fct", "lag_cd4_grp_fct",
    "lag_rna_grp_fct", "bevent_int", "year_fct"),
  fixed.cols =
    c("baseline_cd4_grp_fct", "baseline_rna_grp_fct",
    "baseline_bevent_int", "age_grp_fct", "group_fct"),
  method = "censoring",
  options = opts_haz_unweighted)

## Non-required columns provided, pruning for efficiency
## Pruned
## Expanding Data...
## Expansion Successful
## Moving forward with censoring analysis
## Bootstrapping with 80 % of data 20 times
## Completed

Create predicted cumulative incidence curves

opts_surv_ipcw <- SEQopts(
  km.curves = TRUE,
  weighted = TRUE,
  weight.preexpansion = TRUE,
  weight.eligible_cols = list("eligible"),   # exclude already-treated rows
  bootstrap = TRUE,
  bootstrap.nboot = 20,
  seed = 1234, 
  selection.random = TRUE,
  selection.prob = 0.5  
)

model_surv_ipcw <- SEQuential(
  data = dat,
  id.col = "id",
  time.col = "month_start",
  eligible.col = "eligible",
  treatment.col = "haart_int",
  outcome.col = "aidsordeath_int",
  time_varying.cols =
    c("cd4_grp_fct", "rna_grp_fct", "lag_cd4_grp_fct",
    "lag_rna_grp_fct", "bevent_int", "year_fct"),
  fixed.cols =
    c("baseline_cd4_grp_fct", "baseline_rna_grp_fct",
    "baseline_bevent_int", "age_grp_fct", "group_fct"),
  method = "censoring",
  options = opts_surv_ipcw)

## Non-required columns provided, pruning for efficiency
## Pruned
## Expanding Data...
## Expansion Successful
## Moving forward with censoring analysis
## Bootstrapping with 80 % of data 20 times
## censoring model created successfully
## Creating Survival curves

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

## Completed

km_curve(model_surv_ipcw, 
  plot.type = "risk",
  plot.labels = c("Untreated (no initiation)", "Treated (ART initiation)"),
  plot.title = "", 
  plot.subtitle = "Cumulative incidence for the outcome AIDs or death"
)

## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

## [[1]]

# risk at the end of follow up
risk_data(model_surv_ipcw)

## [[1]]
##       Method      A       Risk     95% LCI    95% UCI         SE
##       <char> <char>      <num>       <num>      <num>      <num>
## 1: censoring      0 0.40533007 0.166774615 0.64388552 0.12171420
## 2: censoring      1 0.03322166 0.005306122 0.06113719 0.01424288

# risk difference and risk ratio at the end of follow-up
risk_comparison(model_surv_ipcw)

## [[1]]
##       A_x    A_y  Risk Ratio RR 95% LCI RR 95% UCI Risk Differerence RD 95% LCI
##    <fctr> <fctr>       <num>      <num>      <num>             <num>      <num>
## 1: risk_0 risk_1  0.08196198 0.02938138  0.2286403        -0.3721084 -0.6122916
## 2: risk_1 risk_0 12.20077854 4.37368304 34.0351588         0.3721084  0.1319252
##    RD 95% UCI
##         <num>
## 1: -0.1319252
## 2:  0.6122916

Sequential Trials Practical: R solution

2026-03-09

Load packages, create additional directories and define additional functions

Data preparation

create dataset with formatted variables

Open the dataset of HIV infected individuals, with follow-up from 1996 to 2003..

id - person id

month_start - start of the month of observation (since baseline - month 0)

month_end - end of the month of observation

aidsordeath_int - whether the person had developed AIDS or died by the end of the next month

haart_int - whether the person had initiated ART by the end of the current month

laghaart_int - whether the person had initiated ART by the end of the previous month

age_group_fct - baseline age group

group_fct - baseline transmission risk group

baseline_cd4_grp_fct - baseline CD4 group

baseline_rna_grp_fct - baseline plasma HIV-1 RNA group

baseline_bevent_int - CDC stage B event in the month before baseline

cd4_grp_fct - CD4 group measured at start of current month

rna_grp_fct - plasma HIV-1 RNA group measured at start of current month

lag_cd4_grp_fct - lagged (three months previous) CD4 group

lag_rna_grp_fct - lagged (three months previous) plasma HIV-1 RNA group

bevent_int - CDC stage B event during previous month

year_fct - current year

eligible - patients eligibility each month

Are they ever treated with ART? If yes, between month 5 - 6

Do they develop AIDS or die during follow-up? Died in month 8 - 9

Do any of the clinical variables update over time? yes, plasma

In the following exercises we will estimate the per-protocol effect of ART initiation on the outcome AIDS or death using a sequential trials approach.

Explore the example patient in the sequential trials and compare to the unexpanded data

How many trials does the example patient contribute to? 6 How many trials are they in the control group 5 and how many are they in the treated group? 1

When is the patient artificially censored? 8th Month

Lets explore the models that have been fit to the data. Which covariates were used in the outcome model and the censoring models?

Create the sequential trials and estimate a hazard ratio for the per protocol effect of ART using pooled logistic regression, but without accounting for the informative censoring

Create predicted cumulative incidence curves