library(tidyverse)
library(rethinking)
library(dagitty)
library(ggdag)

Pipe

# G = gender
# T = time spent studying
# R = exam results

n <- 500
df <- tibble(
  G = rbern( n , 0.5 ),
  T = rnorm( n , ifelse(G==0, 2, -2), 2 ),
  R = rnorm( n , T, 3 )
)

# naive plot
dens(df$R, 
     xlab="Exam results",
     ylim = c(0, max(density(df$R)$y) * 1.5)
)
lines(density(df$R[df$G == 0]), col="blue")
lines(density(df$R[df$G == 1]), col="red")
legend("topright",legend = c("Male", "Female"), col = c("blue", "red"), lwd = 2)

# causal model
dagify(
  R ~ T,
  T ~ G,
  outcome="R",
  exposure="G",
  latent="T"
) |>
  tidy_dagitty() |>
  node_status() |>
  ggplot(aes_dag(color=status)) + 
    geom_dag(size=2) +
    theme_dag_gray(legend.position="none",aspect.ratio=.66) +
    geom_dag_node(size=40) +
    geom_dag_label(aes(label = name), fill = "white", color="black", size=4) +
    scale_y_reverse(lim=c(-.5,2.5)) + coord_flip() + 
    scale_color_manual(values=c("exposure" = "#F8766D", "outcome" = "#619CFF", "latent"="lightgray"))

# interpreted plot
plot(R~T, data=df, col=c(2,4)[G+1], lwd=2)
abline(lm(R~T, data=df), lwd=2)
abline(lm(R[G==0]~T[G==0], data=df), col=2, lwd=2)
abline(lm(R[G==1]~T[G==1], data=df), col=4, lwd=2)

Fork

# A = age at marriage
# H = happiness
# G = gender

n <- 500
df <- tibble(
  G = rbern( n , 0.5 ),
  A = rnorm( n , 2*G-1 ),
  H = rnorm( n , 2*G-1 )
)

# naive plot
plot(H~A, data=df, col="#555", lwd=2)
abline(lm(H~A, data=df), lwd=3)

# causal model
dagify(
  A ~ G,
  H ~ G,
  exposure="G",
  outcome=c("H"),
  latent="A"
) |>
  tidy_dagitty() |>
  node_status() |>
  ggplot(aes_dag(color=status)) + 
    geom_dag(size=2) +
    theme_dag_gray(legend.position="none",aspect.ratio=.66) +
    geom_dag_node(size=40) +
    geom_dag_label(aes(label = name), fill = "white", color="black", size=4) +
    coord_cartesian(xlim = c(-1, 1), ylim = c(-0.5, 1.5)) +
    scale_color_manual(values=c("exposure" = "#F8766D", "outcome" = "#619CFF", "latent"="lightgray"))

# interpreted plot
plot(H~A, data=df, col=c(2,4)[G+1], lwd=2)
abline(lm(H~A, data=df), lwd=3)
abline(lm(H[G==0]~A[G==0], data=df), col=2, lwd=3)
abline(lm(H[G==1]~A[G==1], data=df), col=4, lwd=3)

Collider

# F = family wealth
# H = hardwork
# J = prestigeous job

n <- 500
df <- tibble(
  F = rnorm( n ),
  H = rnorm( n ),
  R = rnorm( n, 0, 0.3 ),
  J = ifelse(F+H+R > 0.5, TRUE, FALSE)
)

# naive plot
plot(H~F, data=df, lwd=2, col=ifelse(J, "royalblue", "transparent"))
abline(lm(H[J==TRUE]~F[J==TRUE], data=df), lwd=3, col="royalblue")

# causal model
dagify(
  J ~ F,
  J ~ H,
  exposure=c("F", "H"),
  latent=c("J")
) |>
  tidy_dagitty(layout = 'eigen') |>
  node_status() |>
  ggplot(aes_dag(color=status)) + 
    geom_dag(size=2) +
    theme_dag_gray(legend.position="none",aspect.ratio=.66) +
    geom_dag_node(size=40) +
    geom_dag_label(aes(label = name), fill = "white", color="black", size=4) +
    coord_cartesian(xlim = c(-1, 1), ylim = c(-1, 1)) +
    scale_color_manual(values=c("exposure" = "#F8766D", "outcome" = "#619CFF", "latent"="lightgray"))

## Warning in layout_with_eigen(graph, type = type, ev = eigenvector): g is
## directed. undirected version is used for the layout.

# interpreted plot
plot(H~F, data=df, lwd=2, col=ifelse(J, "royalblue", "darkgray"))
abline(lm(H[J]~F[J], data=df), lwd=3, col="royalblue")
abline(lm(H[!J]~F[!J], data=df), lwd=3, col="darkgray")
abline(lm(H~F, data=df), lwd=3, col="black")

Bonus: Operationalizing variables

library(dagitty)
library(ggdag)
library(haven)

S01 <- read_dta("nlss2022/data/stata format/S01.dta")
ggplot(S01, aes(x=q01_03)) +
  geom_histogram(binwidth=1, color="darkgray", fill="royalblue")

set.seed(10)
dagify(
  AgeEstimate ~ Age,
  Health ~ Age,
  Health ~ AgeEstimate,
  exposure=c("Age"),
  outcome=c("Health"),
  latent=c("AgeEstimate")
) |>
  tidy_dagitty() |>
  mutate(linetype = ifelse(name == "AgeEstimate" & to == "Health", "dashed", "solid")) |>
  node_status() |>
  ggplot(aes_dag(color=status, edge_linetype = linetype)) + 
    geom_dag(size=2.2) +
    theme_dag_gray(legend.position="none",aspect.ratio=.66) +
    geom_dag_node(size=40) +
    geom_dag_label(aes(label = name), fill = "white", color="black", size=4) +
    scale_x_continuous(expand = expansion(mult = 0.5)) +
    scale_y_continuous(expand = expansion(mult = 0.5)) + 
    scale_color_manual(values=c("exposure" = "#F8766D", "outcome" = "#619CFF", "latent"="lightgray"))

set.seed(NULL)

confounders

Pipe

Fork

Collider

Bonus: Operationalizing variables