IPW and propsensity scores

Methods for handling baseline confounding (IPW and propsensity scores)

library(haven) # read in .dta files

## Warning: package 'haven' was built under R version 4.5.2

library(labelled) # .dta variable classes

## Warning: package 'labelled' was built under R version 4.5.2

library(sandwich) # robust SEs

## Warning: package 'sandwich' was built under R version 4.5.2

library(parameters) # robust SEs using tidy_robust() in gtsummary::tbl_regression()

## Warning: package 'parameters' was built under R version 4.5.2

library(gtsummary) # nicely formatted tables

## Warning: package 'gtsummary' was built under R version 4.5.2

library(tidyverse) # data manipulation

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'tidyr' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## Warning: package 'purrr' was built under R version 4.5.2

## Warning: package 'dplyr' was built under R version 4.5.2

## Warning: package 'stringr' was built under R version 4.5.2

## Warning: package 'lubridate' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# define a function to return an odds ratio with 95% CI limits using a robust SE
# from an input model object from a glm() function call
robres <- function(mod) {
  cov <- vcovHC(mod, type = "HC0")
  stderr <- sqrt(diag(cov))
  z <- coef(mod) / stderr
  pval <- 2 * pnorm(-1 * abs(z))
  qval <- qnorm(.975)
  results <- round(cbind(
    OR = exp(coef(mod)),
    LL = exp(coef(mod) - qval * stderr),
    UL = exp(coef(mod) + qval * stderr)
  ), 2)
  cbind(results, pval)
}

Part 1: Methods for handling baseline confounding In this section of the practical we will compare the use of standard regression models, propensity scores and inverse-probability weights to control confounding by indication (confounding in comparisons of different drug regimens because doctors use prognostic factors to choose regimens). We will use the dataset called artcc_propensity.dta. Our data are taken from cohort studies of HIV- infected individuals, who followed one of two regimens (A or B), coded in variable drugb. Our aim is to understand whether the two regimens were associated with different probabilities of progression to AIDS and/or death (variable aidsordeath). Open the dataset and understand the different variables and what type of data they contain.

setwd("C:/Users/agn21/OneDrive/Desktop/Msc Epidemiology/6. Causal Inference/Week 14/R")

dat2 <- read_dta("artcc_propensity.dta")

dat2 <- dat2 %>% mutate(
  female_fct = as_factor(female),
  idu_fct = as_factor(idu),
  stagec_fct = as_factor(stagec),
  cd4_cat_fct = as_factor(cd4_cat),
  drugb_fct = as_factor(drugb),
  aidsordeath_fct = as_factor(aidsordeath),
  female = unlabelled(female),
  idu = unlabelled(idu),
  stagec = unlabelled(stagec),
  drugb = unclass(drugb)
)

dat2 %>%
  select(!c(id, cohort, female, idu, stagec, cd4_cat, drugb, aidsordeath)) %>%
  tbl_summary()

Characteristic	N = 4,791¹
Year of starting HAART
2001	2,540 (53%)
2002	2,251 (47%)
Age of patient	39 (33, 47)
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	15 (11, 18)
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	4.85 (4.38, 5.28)
rna_cat
0	611 (13%)
10000	2,290 (48%)
1e+05	1,890 (39%)
age_cat
16	689 (14%)
30	1,778 (37%)
40	1,410 (29%)
50	914 (19%)
age2	1,521 (1,089, 2,209)
Female indicator, 1=female
male	3,660 (76%)
female	1,131 (24%)
Presumed risk from IDU, 1=yes
Non-IDU	4,046 (84%)
IDU	745 (16%)
CDC stage C, 0=A/B 1=C
A/B	3,865 (81%)
C	926 (19%)
cd4_cat_fct
0	661 (14%)
50	408 (8.5%)
100	1,002 (21%)
200	1,643 (34%)
350	1,077 (22%)
drugb_fct
A	1,915 (40%)
B	2,876 (60%)
aidsordeath_fct
0	4,449 (93%)
1	342 (7.1%)
¹ n (%); Median (Q1, Q3)

Examine how many people took the two regimens and how many went on to have AIDS and/or die

dat2 %>%
  select(drugb, aidsordeath) %>%
  tbl_cross(label = list(drugb ~ "Drug B", aidsordeath ~ "Aids or Death"),
            percent = "row")

	Aids or Death		Total
	0	1	Total
Drug B
0	1,802 (94%)	113 (5.9%)	1,915 (100%)
1	2,647 (92%)	229 (8.0%)	2,876 (100%)
Total	4,449 (93%)	342 (7.1%)	4,791 (100%)

dat2 %>%
  select(drugb, aidsordeath) %>%
  tbl_cross(label = list(drugb ~ "Drug B", aidsordeath ~ "Aids or Death"),
            percent = "column")

	Aids or Death		Total
	0	1	Total
Drug B
0	1,802 (41%)	113 (33%)	1,915 (40%)
1	2,647 (59%)	229 (67%)	2,876 (60%)
Total	4,449 (100%)	342 (100%)	4,791 (100%)

dat2 %>%
  select(drugb, aidsordeath) %>%
  tbl_cross(label = list(drugb ~ "Drug B", aidsordeath ~ "Aids or Death"),
            percent = "cell")

	Aids or Death		Total
	0	1	Total
Drug B
0	1,802 (38%)	113 (2.4%)	1,915 (40%)
1	2,647 (55%)	229 (4.8%)	2,876 (60%)
Total	4,449 (93%)	342 (7.1%)	4,791 (100%)

Because HIV doctors use prognostic factors to choose treatment regimens, we expect that there will be confounding (common causes of the drug regimen and the outcome). The following prognostic variables were recorded in the dataset. • female Female indicator, 1=female • idu Presumed HIV transmission from injection drug use, 1=yes • stagec CDC stage C, 0=A/B 1=C • haartyr Year of starting HAART, 2001/2002 • age Age of patient, years • Age2 Squared age of patient, years • cd4_t0_sqrt Square root CD4 cell count nearest to the date of starting therapy (cells/mL) • rna_log10 Log 10 HIV-1 RNA (copies/ml) nearest to the date of starting therapy 1. Standard approach Use the standard approach to the control of confounding: fit logistic regression models to compare the crude and adjusted effects of drug B (compared with drug A) on the odds of AIDS or death. For example:

logreg1 <-
  glm(aidsordeath ~ drugb_fct, data = dat2, family = binomial)
logreg1 %>% tbl_regression(exp = TRUE, show_single_row = "drugb_fct")

Characteristic	OR	95% CI	p-value
drugb_fct	1.38	1.10, 1.75	0.007
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

logreg2 <-
  glm(
    aidsordeath ~ drugb_fct + cd4_t0_sqrt + age + age2 + haartyr + rna_log10 + 
      female_fct + idu_fct + stagec_fct,
    data = dat2,
    family = binomial
  )

# Note the crude and adjusted odds ratios (with 95% CI) for the effect of drug B.

logreg2 %>% tbl_regression(exp = TRUE, 
                           show_single_row = c("drugb_fct", 
                                               "female_fct", 
                                               "idu_fct", 
                                               "stagec_fct"))

Characteristic	OR	95% CI	p-value
drugb_fct	0.97	0.76, 1.25	0.8
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	0.89	0.87, 0.91	<0.001
Age of patient	1.09	1.02, 1.17	0.014
age2	1.00	1.00, 1.00	0.11
Year of starting HAART	0.83	0.66, 1.05	0.12
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	1.06	0.89, 1.28	0.5
Female indicator, 1=female	0.86	0.63, 1.16	0.3
Presumed risk from IDU, 1=yes	1.18	0.86, 1.59	0.3
CDC stage C, 0=A/B 1=C	1.66	1.29, 2.15	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Propensity scores We will now derive the propensity score by fitting the logistic regression for the probability of receiving drug B (compared with drug A). For example:

ps <-
  glm(
    drugb ~ cd4_t0_sqrt + age + age2 + haartyr + rna_log10 + female_fct + 
      idu_fct + stagec_fct,
    data = dat2,
    family = binomial
  )
ps %>% tbl_regression(exp = TRUE,
                      show_single_row = c("female_fct", 
                                          "idu_fct", 
                                          "stagec_fct"))

Characteristic	OR	95% CI	p-value
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	0.96	0.95, 0.97	<0.001
Age of patient	1.03	0.99, 1.06	0.2
age2	1.00	1.00, 1.00	0.2
Year of starting HAART	0.72	0.64, 0.81	<0.001
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	1.50	1.37, 1.64	<0.001
Female indicator, 1=female	0.67	0.58, 0.78	<0.001
Presumed risk from IDU, 1=yes	0.71	0.61, 0.84	<0.001
CDC stage C, 0=A/B 1=C	1.24	1.05, 1.46	0.014
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

The propensity score is the predicted probability of receiving drug B. We can put this in a new object called p using the fitted.values() function:

dat2$p <- fitted.values(ps)

Examine separate histograms of the distribution of p in individuals who received drug A and drug B:

dat2 %>%
  ggplot(aes(x = p, fill = drugb_fct)) +
  geom_histogram(alpha = 0.6, stat = "density")

## Warning in geom_histogram(alpha = 0.6, stat = "density"): Ignoring unknown
## parameters: `binwidth` and `bins`

# Now we will control for confounding by including the propensity score in a logistic regression model and make a note of the propensity score adjusted odds ratio (with 95% CI) for the effect of drug B:


logreg3 <- glm(aidsordeath ~ drugb_fct + p, data = dat2, family = binomial)
logreg3 %>% tbl_regression(exp = TRUE, show_single_row = "drugb_fct")

Characteristic	OR	95% CI	p-value
drugb_fct	0.97	0.76, 1.24	0.8
p	558	190, 1,677	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

To avoid linearity assumptions we may wish to model categories defined by quintiles of the propensity score.

dat2 <- dat2 %>%
  mutate(qpscore = as_factor(ntile(p, 5)))
logreg4 <- glm(aidsordeath ~ drugb_fct + qpscore, data = dat2, family = binomial)

logreg4 %>% tbl_regression(
  exp = TRUE,
  show_single_row = "drugb_fct",
  intercept = TRUE
)

Characteristic	OR	95% CI	p-value
(Intercept)	0.03	0.02, 0.04	<0.001
drugb_fct	0.98	0.77, 1.26	0.9
qpscore
1	—	—
2	1.34	0.79, 2.28	0.3
3	2.11	1.31, 3.49	0.003
4	3.47	2.22, 5.61	<0.001
5	7.09	4.64, 11.3	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Inverse probability weighting We will also conduct an inverse-probability-of-treatment weighted analysis. The probability of the treatment an individual actually received is: • p if the individual received drug B • 1 - p if the individual received drug A

dat2 <- dat2 %>%
  mutate(ptrtrec = case_when(
    drugb == 1 ~ p,
    drugb == 0 ~ 1 - p
  ))

Now generate the IPT weight:

dat2 <- dat2 %>%
  mutate(iptweight = 1 / ptrtrec)

and use the IPT weight to control confounding:

logreg5 <-
  glm(
    aidsordeath ~ drugb_fct,
    data = dat2,
    family = binomial,
    weights = iptweight
  )

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

# logreg5 %>% tbl_regression(exp = TRUE, show_single_row = "drugb_fct")

Note the OR and 95% CI.

robres(logreg5)

##               OR   LL   UL          pval
## (Intercept) 0.08 0.06 0.09 1.447630e-139
## drugb_fctB  0.99 0.78 1.27  9.484363e-01

logreg5 %>%
  tbl_regression(tidy_fun = partial(tidy_robust, vcov = "HC0"),
                 exponentiate = TRUE)

Characteristic	OR	95% CI	p-value
drugb_fct
A	—	—
B	0.99	0.78, 1.27	>0.9
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Part 2: Investigating the effect of “super vitamins” on aids or death in the HIV cohort In a hypothetical example, a researcher speculated that a popular new brand of “super vitamins” may be beneficial in people with HIV infection and reduce the likelihood of progression to aids or death. The variable vitamins has been added to the artcc dataset and indicates whether individuals have taken “super vitamins” supplementation or not. Open the dataset artcc_vitamins.dta and investigate this hypothesis using: 1. The standard approach 2. Propensity scores 3. IP weighting

dat3 <- read_dta("artcc_vitamins.dta")

dat3 <- dat3 %>% mutate(
  female_fct = as_factor(female),
  idu_fct = as_factor(idu),
  stagec_fct = as_factor(stagec),
  cd4_cat_fct = as_factor(cd4_cat),
  rna_cat_fct = as_factor(rna_cat),
  age_cat_fct = as_factor(age_cat),
  drugb_fct = as_factor(drugb),
  aidsordeath_fct = as_factor(aidsordeath),
  haartyr_fct = as_factor(haartyr),
  vitamins_fct = as_factor(vitamins),
  female = unlabelled(female),
  idu = unlabelled(idu),
  stagec = unlabelled(stagec),
  drugb = unclass(drugb)
)

dat3 %>%
  select(!c(id, cohort, female, idu, stagec, cd4_cat, rna_cat, age_cat, drugb, aidsordeath, haartyr, vitamins)) %>%
  tbl_summary()

Characteristic	N = 4,791¹
Age of patient	39 (33, 47)
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	15 (11, 18)
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	4.85 (4.38, 5.28)
age2	1,521 (1,089, 2,209)
Female indicator, 1=female
male	3,660 (76%)
female	1,131 (24%)
Presumed risk from IDU, 1=yes
Non-IDU	4,046 (84%)
IDU	745 (16%)
CDC stage C, 0=A/B 1=C
A/B	3,865 (81%)
C	926 (19%)
cd4_cat_fct
0	661 (14%)
50	408 (8.5%)
100	1,002 (21%)
200	1,643 (34%)
350	1,077 (22%)
rna_cat_fct
0	611 (13%)
10000	2,290 (48%)
1e+05	1,890 (39%)
age_cat_fct
16	689 (14%)
30	1,778 (37%)
40	1,410 (29%)
50	914 (19%)
drugb_fct
A	1,915 (40%)
B	2,876 (60%)
aidsordeath_fct
0	4,449 (93%)
1	342 (7.1%)
haartyr_fct
2001	2,540 (53%)
2002	2,251 (47%)
vitamins_fct
0	4,251 (89%)
1	540 (11%)
¹ Median (Q1, Q3); n (%)

dat3 %>%
  select(vitamins_fct, aidsordeath_fct) %>%
  tbl_cross(
    label = list(vitamins_fct ~ "Vitamins", aidsordeath_fct ~ "Aids or Death"),
    percent = "row"
  )

	Aids or Death		Total
	0	1	Total
Vitamins
0	3,936 (93%)	315 (7.4%)	4,251 (100%)
1	513 (95%)	27 (5.0%)	540 (100%)
Total	4,449 (93%)	342 (7.1%)	4,791 (100%)

dat3 %>%
  select(vitamins_fct, aidsordeath_fct) %>%
  tbl_cross(
    label = list(vitamins_fct ~ "Vitamins", aidsordeath_fct ~ "Aids or Death"),
    percent = "column"
  )

	Aids or Death		Total
	0	1	Total
Vitamins
0	3,936 (88%)	315 (92%)	4,251 (89%)
1	513 (12%)	27 (7.9%)	540 (11%)
Total	4,449 (100%)	342 (100%)	4,791 (100%)

dat3 %>%

select(vitamins_fct, aidsordeath_fct) %>%
  tbl_cross(
    label = list(vitamins_fct ~ "Vitamins", aidsordeath_fct ~ "Aids or Death"),
    percent = "cell"
  )

	Aids or Death		Total
	0	1	Total
Vitamins
0	3,936 (82%)	315 (6.6%)	4,251 (89%)
1	513 (11%)	27 (0.6%)	540 (11%)
Total	4,449 (93%)	342 (7.1%)	4,791 (100%)

lr1 <- glm(aidsordeath ~ vitamins_fct, data = dat3, family = binomial)
lr1 %>% tbl_regression(exp = TRUE, show_single_row = "vitamins_fct")

Characteristic	OR	95% CI	p-value
vitamins_fct	0.66	0.43, 0.97	0.042
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

lr2 <-
  glm(
    aidsordeath ~ vitamins_fct + cd4_t0_sqrt + age + age2 +
      haartyr_fct + rna_log10 + female_fct + idu_fct + stagec_fct,
    data = dat3,
    family = binomial
  )
lr2 %>% tbl_regression(
  exp = TRUE,
  show_single_row = c(
    "vitamins_fct",
    "haartyr_fct",
    "female_fct",
    "idu_fct",
    "stagec_fct"
  )
)

Characteristic	OR	95% CI	p-value
vitamins_fct	1.36	0.87, 2.07	0.2
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	0.89	0.87, 0.91	<0.001
Age of patient	1.10	1.03, 1.18	0.009
age2	1.00	1.00, 1.00	0.079
haartyr_fct	0.85	0.67, 1.07	0.2
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	1.07	0.89, 1.28	0.5
Female indicator, 1=female	0.86	0.63, 1.15	0.3
Presumed risk from IDU, 1=yes	1.20	0.88, 1.62	0.2
CDC stage C, 0=A/B 1=C	1.68	1.30, 2.17	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

lr3 <- glm(vitamins ~ cd4_t0_sqrt + age + age2 + haartyr_fct +
             rna_log10 + female_fct + idu_fct + stagec_fct, 
           data = dat3, family = binomial)
lr3 %>% tbl_regression(
  exp = TRUE,
  show_single_row = c(
    "haartyr_fct",
    "female_fct",
    "idu_fct",
    "stagec_fct"
  )
)

Characteristic	OR	95% CI	p-value
Square root CD4 cell count nearest to the date of starting therapy (cells/mL)	1.05	1.03, 1.07	<0.001
Age of patient	0.81	0.77, 0.86	<0.001
age2	1.00	1.00, 1.00	<0.001
haartyr_fct	0.49	0.40, 0.59	<0.001
Log 10 HIV-1 RNA nearest to the date of starting therapy (copies/ml)	0.71	0.61, 0.82	<0.001
Female indicator, 1=female	1.53	1.24, 1.88	<0.001
Presumed risk from IDU, 1=yes	0.44	0.31, 0.61	<0.001
CDC stage C, 0=A/B 1=C	0.56	0.40, 0.78	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

dat3$p <- fitted.values(lr3)
lr4 <- glm(aidsordeath ~ vitamins_fct + p, data = dat3, family = binomial)
lr4 %>% tbl_regression(exp = TRUE, show_single_row = "vitamins_fct")

Characteristic	OR	95% CI	p-value
vitamins_fct	1.46	0.93, 2.20	0.083
p	0.00	0.00, 0.00	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

dat3 <- dat3 %>%
  mutate(qpscore = as_factor(ntile(p, 5)))

lr5 <- glm(aidsordeath ~ vitamins_fct + qpscore, data = dat3, family = binomial)
lr5 %>% tbl_regression(exp = TRUE, show_single_row = "vitamins_fct")

Characteristic	OR	95% CI	p-value
vitamins_fct	1.42	0.90, 2.16	0.11
qpscore
1	—	—
2	0.62	0.47, 0.83	0.001
3	0.35	0.25, 0.49	<0.001
4	0.24	0.16, 0.35	<0.001
5	0.10	0.06, 0.17	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

dat3 %>%
  ggplot(aes(x = p, fill = vitamins_fct)) +
  geom_histogram(alpha = 0.6, bins = 50, stat = "density")

## Warning in geom_histogram(alpha = 0.6, bins = 50, stat = "density"): Ignoring
## unknown parameters: `binwidth` and `bins`

dat3 <- dat3 %>%
  mutate(ptrtrec = case_when(
    vitamins == 1 ~ p,
    vitamins == 0 ~ 1 - p
  ))

dat3 <- dat3 %>%
  mutate(iptweight = 1 / ptrtrec)
lr6 <-
  glm(
    aidsordeath ~ vitamins_fct,
    data = dat3,
    family = binomial,
    weights = iptweight
  )

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

# lr6 %>% tbl_regression(exp = TRUE, show_single_row = "vitamins_fct")
robres(lr6)

##                 OR   LL   UL        pval
## (Intercept)   0.07 0.07 0.08 0.000000000
## vitamins_fct1 3.08 1.64 5.79 0.000471851

lr6 %>%
  tbl_regression(tidy_fun = partial(tidy_robust, vcov = "HC0"),
                 exponentiate = TRUE)

Characteristic	OR	95% CI	p-value
vitamins_fct
0	—	—
1	3.08	1.64, 5.79	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

summary(dat3$iptweight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.004   1.043   1.086   1.980   1.209 136.647

IPW and propsensity scores

2026-03-14