confounders

Show setup code
library(tidyverse)
library(rethinking)
library(dagitty)
library(ggdag)

draw_dag <- function(df) {
  df |>
    ggplot(aes(
      x = x,
      y = y,
      xend = xend,
      yend = yend,
      color = status,
      edge_linetype = linetype,
      edge_color = linecolor
    )) +
    geom_dag_edges(
      start_cap = ggraph::circle(20, 'mm'),
      end_cap = ggraph::circle(20, 'mm'),
      edge_width = 1,
      arrow_directed = grid::arrow(
        length = grid::unit(12, "pt"),
        type = "open"
      ),
      arrow_bidirected = grid::arrow(
        length = grid::unit(24, "pt"),
        type = "open",
        ends = "both"
      )
    ) +
    geom_dag_node(size = 40) +
    geom_dag_label(
      aes(label = name),
      fill = "white",
      color = "black",
      size = 4
    ) +
    theme_dag_gray(legend.position = "none", aspect.ratio = .66) +
    scale_x_continuous(expand = expansion(mult = 0.5)) +
    scale_y_continuous(expand = expansion(mult = 0.5)) +
    scale_color_manual(
      values = c(
        "exposure" = "#F8766D",
        "outcome" = "#619CFF",
        "latent" = "lightgray"
      )
    )
}

Pipe

Show data generator code
# G = gender
# T = time spent studying
# R = exam results
set.seed(100)
n <- 500
df <- tibble(
  G = rbern(n, 0.5),
  T = rnorm(n, ifelse(G == 0, 2, -2), 2),
  R = rnorm(n, T, 3)
)

Naive analysis

Show code for naive visualization
dens(df$R, xlab = "Exam results", ylim = c(0, max(density(df$R)$y) * 1.5))
lines(density(df$R[df$G == 0]), col = "blue")
lines(density(df$R[df$G == 1]), col = "red")
legend(
  "topright",
  legend = c("Male", "Female"),
  col = c("blue", "red"),
  lwd = 2
)

Causal Model

Show DAG code
set.seed(105)
dagify(
  StudyTime ~ Gender,
  Result ~ StudyTime,
  Result ~ Gender,
  outcome = "Result",
  exposure = "Gender",
  latent = "StudyTime"
) |>
  tidy_dagitty() |>
  node_status() |>
  mutate(
    linetype = ifelse(name == "Gender" & to == "Result", "52", "solid"),
    linecolor = ifelse(linetype == "solid", "black", "darkgray")
  ) |>
  draw_dag()

Reinterpretation

Show code for interpreted visualization
plot(
  R ~ T,
  xlab = "Time Spent Studying",
  ylab = "Exam Results",
  data = df,
  col = c(4, 2)[G + 1],
  lwd = 2
)
legend(
  "bottomright",
  legend = c("Male", "Female"),
  col = c("blue", "red"),
  lwd = 2
)
abline(lm(R ~ T, data = df), lwd = 2)
abline(lm(R[G == 0] ~ T[G == 0], data = df), col = 4, lwd = 2)
abline(lm(R[G == 1] ~ T[G == 1], data = df), col = 2, lwd = 2)

Fork

Show data generator code
# A = age at marriage
# H = happiness
# G = gender
set.seed(116)
n <- 500
df <- tibble(
  G = rbern(n, 0.5),
  A = rnorm(n, 2 * G - 1),
  H = rnorm(n, 2 * G - 1)
)

Naive analysis

Show code for naive visualization
plot(
  H ~ A,
  data = df,
  col = "#555",
  lwd = 2,
  xlab = "Age at Marriage",
  ylab = "Happiness"
)
abline(lm(H ~ A, data = df), lwd = 3)

Causal model

Show DAG code
set.seed(101)
dagify(
  AgeAtMarriage ~ Gender,
  Happiness ~ Gender,
  Happiness ~ AgeAtMarriage,
  exposure = "AgeAtMarriage",
  outcome = "Happiness",
  latent = "Gender"
) |>
  tidy_dagitty() |>
  node_status() |>
  mutate(
    linetype = ifelse(
      name == "AgeAtMarriage" & to == "Happiness",
      "52",
      "solid"
    ),
    linecolor = ifelse(linetype == "solid", "black", "darkgray")
  ) |>
  draw_dag()

Reinterpretation

Show code for interpreted visualization
# interpreted plot
plot(
  H ~ A,
  data = df,
  col = c(2, 4)[G + 1],
  lwd = 2,
  xlab = "Age at Marriage",
  ylab = "Happiness"
)
abline(lm(H ~ A, data = df), lwd = 3)
abline(lm(H[G == 0] ~ A[G == 0], data = df), col = 2, lwd = 3)
abline(lm(H[G == 1] ~ A[G == 1], data = df), col = 4, lwd = 3)
legend(
  "bottomright",
  legend = c("Male", "Female"),
  col = c("blue", "red"),
  lwd = 2
)

Collider

Show data generator code
# F = family wealth
# H = hardwork
# J = prestigeous job

n <- 500
df <- tibble(
  F = rnorm(n),
  H = rnorm(n),
  R = rnorm(n, 0, 0.3),
  J = ifelse(F + H + R > 0.5, TRUE, FALSE)
)

Naive analysis

Show code for naive visualization
plot(
  H ~ F,
  data = df,
  lwd = 2,
  col = ifelse(J, "royalblue", "transparent"),
  xlab = "Family Wealth",
  ylab = "Hardwork"
)
abline(lm(H[J == TRUE] ~ F[J == TRUE], data = df), lwd = 3, col = "royalblue")

Causal Model

Show DAG code
set.seed(11243)
dagify(
  FamilyWealth ~ Hardwork,
  JobStatus ~ FamilyWealth,
  JobStatus ~ Hardwork,
  exposure = "Hardwork",
  outcome = "FamilyWealth",
  latent = "JobStatus"
) |>
  tidy_dagitty() |>
  node_status() |>
  mutate(
    linetype = ifelse(
      name == "Hardwork" & to == "FamilyWealth",
      "52",
      "solid"
    ),
    linecolor = ifelse(linetype == "solid", "black", "darkgray")
  ) |>
  draw_dag()

Reinterpretation

Show code for interpreted visualization
plot(
  H ~ F,
  data = df,
  lwd = 2,
  col = ifelse(J, "royalblue", "darkgray"),
  xlab = "Family Wealth",
  ylab = "Hardwork"
)
abline(lm(H[J] ~ F[J], data = df), lwd = 3, col = "royalblue")
abline(lm(H[!J] ~ F[!J], data = df), lwd = 3, col = "darkgray")
abline(lm(H ~ F, data = df), lwd = 3, col = "black")
legend(
  "topright",
  legend = c("Job", "No Job"),
  col = c("blue", "darkgray"),
  lwd = 2
)

Operationalization

Show R code
library(dagitty)
library(ggdag)
library(haven)

S01 <- read_dta("nlss2022/data/stata format/S01.dta")
ggplot(S01, aes(x = q01_03)) +
  geom_histogram(binwidth = 1, color = "darkgray", fill = "royalblue")

Show R code
set.seed(10)
dagify(
  AgeEstimate ~ Age,
  Health ~ Age,
  Health ~ AgeEstimate,
  exposure = c("Age"),
  outcome = c("Health"),
  latent = c("AgeEstimate")
) |>
  tidy_dagitty() |>
  mutate(
    linetype = ifelse(name == "AgeEstimate" & to == "Health", "dashed", "solid")
  ) |>
  node_status() |>
  ggplot(aes_dag(color = status, edge_linetype = linetype)) +
  geom_dag(size = 2.2) +
  theme_dag_gray(legend.position = "none", aspect.ratio = .66) +
  geom_dag_node(size = 40) +
  geom_dag_label(aes(label = name), fill = "white", color = "black", size = 4) +
  scale_x_continuous(expand = expansion(mult = 0.5)) +
  scale_y_continuous(expand = expansion(mult = 0.5)) +
  scale_color_manual(
    values = c(
      "exposure" = "#F8766D",
      "outcome" = "#619CFF",
      "latent" = "lightgray"
    )
  )

Show R code
set.seed(NULL)