X: treatment (categorical exposure)
Y: binary outcome
Z: confounding variable (categorical)
X -> Y;
X -> Z;
Z -> Y;
library(knitr)
library(tidyverse)
library(scales)
library(broom)
library(Epi)
invlogit <- function(x) exp(x)/(1 + exp(x))
theme_set(theme_minimal())
sim_data <- function(n, b0, bx, bz, bxz = 0, px = .5, pz = .5, c0 = -.7, cz = .7){
z <- sample(c(0, 1), n, prob = c(1 - pz, pz), replace = T)
x <- rbinom(n, size = 1, prob = invlogit(c0 + cz*z))
y <- rbinom(n, size = 1, prob = invlogit(b0 + bx*x + bz*z + bxz*x*z))
dat <- data.frame(
x = x,
z = z,
y = y
)
return(dat)
}
N <- 1000
set.seed(2204191) # seed for reproducibility
# simulate data
dat_notrteff <- lapply(1:N, function(i){
sim_data(n = 1000, b0 = -2, bx = 0, bz = 1.946)
})
# adjusted OR (Mantel-Haenszel)
adjor_notrteff <- lapply(dat_notrteff, function(dat){
mant_test <- mantelhaen.test(x = dat$x, y = dat$y, z = dat$z)
tidy(mant_test)
}) %>%
bind_rows()
graphical presentation
ggplot(adjor_notrteff, aes(estimate)) +
geom_histogram(aes(y = ..density..), alpha = 0.4, fill = "#F8766D") +
geom_line(stat = "density") +
scale_x_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Z-Adjusted Odds Ratio (ln)", y = "Empirical Distribution")
summary descriptive measures
quantile(adjor_notrteff$estimate, p = c(.025, .975)) %>%
kable(digits = 3, col.names = "Percentiles")
Percentiles | |
---|---|
2.5% | 0.740 |
97.5% | 1.387 |
set.seed(2204192) # seed for reproducibility
# simulate data
dat_bentrteff <- lapply(1:N, function(i){
sim_data(n = 1000, b0 = -2, bx = -.7, bz = 1.946)
})
# adjusted OR (Mantel-Haenszel)
adjor_bentrteff <- lapply(dat_bentrteff, function(dat){
mant_test <- mantelhaen.test(x = dat$x, y = dat$y, z = dat$z)
tidy(mant_test)
}) %>%
bind_rows()
graphical presentation
ggplot(adjor_bentrteff, aes(estimate)) +
geom_histogram(aes(y = ..density..), alpha = 0.4, fill = "#00BFC4") +
geom_line(stat = "density") +
scale_x_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Z-Adjusted Odds Ratio (ln)", y = "Empirical Distribution")
summary descriptive measures
quantile(adjor_bentrteff$estimate, p = c(.025, .975)) %>%
kable(digits = 3, col.names = "Percentiles")
Percentiles | |
---|---|
2.5% | 0.356 |
97.5% | 0.678 |
bind_rows(
mutate(adjor_notrteff, DGM = "H: OR_XY|Z = 1"),
mutate(adjor_bentrteff, DGM = "H: OR_XY|Z = .5")
) %>%
ggplot(aes(estimate)) +
geom_line(stat = "density", aes(col = DGM)) +
geom_vline(xintercept = c(.5, 1), linetype = "dashed") +
#geom_area(stat = "density", aes(fill = DGM), alpha = .3) +
scale_x_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Z-Adjusted Odds Ratio (ln)", y = "Empirical Distribution")
Type I error and power
data.frame(
"Type I error" = mean(adjor_notrteff$p.value <= .05),
"Power" = mean(adjor_bentrteff$p.value <= .05)
) %>%
kable()
Type.I.error | Power |
---|---|
0.05 | 0.994 |
set.seed(2204193) # seed for reproducibility
# simulate data
dat_bentrteff2 <- lapply(1:N, function(i){
sim_data(n = 1000, b0 = -2, bx = log(.8), bz = 1.946)
})
# adjusted OR (Mantel-Haenszel)
adjor_bentrteff2 <- lapply(dat_bentrteff2, function(dat){
mant_test <- mantelhaen.test(x = dat$x, y = dat$y, z = dat$z)
tidy(mant_test)
}) %>%
bind_rows()
bind_rows(
mutate(adjor_notrteff, DGM = "H: OR_XY|Z = 1"),
mutate(adjor_bentrteff2, DGM = "H: OR_XY|Z = .8")
) %>%
ggplot(aes(estimate)) +
geom_line(stat = "density", aes(col = DGM)) +
geom_vline(xintercept = c(.8, 1), linetype = "dashed") +
#geom_area(stat = "density", aes(fill = DGM), alpha = .3) +
scale_x_continuous(trans = "log", breaks = pretty_breaks()) +
labs(x = "Z-Adjusted Odds Ratio (ln)", y = "Empirical Distribution")
Type I error and power
data.frame(
"Type I error" = mean(adjor_notrteff$p.value <= .05),
"Power" = mean(adjor_bentrteff2$p.value <= .05)
) %>%
kable()
Type.I.error | Power |
---|---|
0.05 | 0.273 |
cases_list <- list(
"Case #1 Opposite exposure effects" = c(b0 = -2, bx = log(0.5), bz = 1.946, bxz = log(2/0.5)),
"Case #2 Similar beneficial exposure effects" = c(b0 = -2, bx = log(0.5), bz = 1.946, bxz = log(0.5/0.5)),
"Case #3 Beneficial exposure effect and no exposure effect" = c(b0 = -2, bx = log(0.5), bz = 1.946, bxz = log(1/0.5)),
"Case #4 No effects" = c(b0 = -2, bx = log(1), bz = 1.946, bxz = log(1/1))
)
set.seed(2204194) # seed for reproducibility
# simulate data
dat_caseslist <- lapply(cases_list, function(case){
lapply(1:N, function(i){
sim_data(n = 1000, b0 = case["b0"], bx = case["bx"], bz = case["bz"], bxz = case[["bxz"]])
})
})
# conditional ORs from a logist regression
condOR_caseslist <- lapply(dat_caseslist, function(case){
lapply(case, function(dat){
fit <- glm(y ~ x*z, data = dat, family = "binomial")
data.frame(
quantity = c("OR_Z0", "OR_Z1"),
OR = c(exp(coef(fit)["x"]), exp(coef(fit)["x"] + coef(fit)["x:z"]))
)
}) %>%
bind_rows(.id = "sim")
}) %>%
bind_rows(.id = "case")
ggplot(condOR_caseslist, aes(OR, col = quantity)) +
geom_line(stat = "density") +
facet_wrap(~ case, scales = "free") +
labs(x = "Odds Ratio (log)", y = "Empirical Distribution") +
theme(legend.position = "bottom")
cases_list2 <- list(
"Simulation #1 Very low power to detect a large discrepancy" =
c(n = 1000, b0 = -2, bx = log(1), bz = 1.946, bxz = log(1.5/1.0)),
"Simulation #2 Increasing power for Simulation #1" =
c(n = 10000, b0 = -2, bx = log(1), bz = 1.946, bxz = log(1.5/1.0)),
"Simulation #3 Very high power to detect a small discrepancy" =
c(n = 500000, b0 = -2, bx = log(1), bz = 1.946, bxz = log(1.05/1.0))
)
set.seed(2204195) # seed for reproducibility
N <- 100 # reducing computation times
# simulate data
dat_caseslist2 <- lapply(cases_list2, function(case){
lapply(1:N, function(i){
sim_data(n = case["n"], b0 = case["b0"], bx = case["bx"], bz = case["bz"],
bxz = case[["bxz"]], c0 = log(1), cz = 1)
})
})
# conditional ORs from a logist regression
# obs: it takes some time (~10 minutes)
condOR_caseslist2 <- lapply(dat_caseslist2, function(case){
lapply(case, function(dat){
fit <- glm(y ~ x*z, data = dat, family = "binomial")
data.frame(
quantity = c("OR_Z0", "OR_Z1"),
OR = c(exp(coef(fit)["x"]), exp(coef(fit)["x"] + coef(fit)["x:z"])),
pvalue_inter = c(tidy(fit)[4, "p.value", drop = T], NA)
)
}) %>%
bind_rows(.id = "sim")
}) %>%
bind_rows(.id = "case")
Graphical comparison
ggplot(condOR_caseslist2, aes(OR, col = quantity)) +
geom_line(stat = "density") +
facet_wrap(~ case, scales = "free", ncol = 2, nrow = 2) +
labs(x = "Odds Ratio (log)", y = "Empirical Distribution") +
theme(legend.position = "bottom")
power
filter(condOR_caseslist2, !is.na(pvalue_inter)) %>%
group_by(case) %>%
summarise(power = mean(pvalue_inter < 0.05)) %>%
kable()
case | power |
---|---|
Simulation #1 Very low power to detect a large discrepancy | 0.23 |
Simulation #2 Increasing power for Simulation #1 | 0.97 |
Simulation #3 Very high power to detect a small discrepancy | 0.91 |
library(vcd)
# selecting only one simulated dataset
dat <- dat_caseslist[[3]][[17]]
tab_dat <- xtabs(~ y + x + z, data = dat) # 3-way table
tab_dat
, , z = 0
x
y 0 1
0 298 154
1 42 11
, , z = 1
x
y 0 1
0 128 134
1 119 114
oddsratio(tab_dat, log = FALSE) # conditional odds-ratio
odds ratios for y and x by z
0 1
0.5068027 0.9150884
woolf_test(tab_dat) # Woolf-test
Woolf-test on Homogeneity of Odds Ratios (no 3-Way assoc.)
data: tab_dat
X-squared = 2.2237, df = 1, p-value = 0.1359