Q3- Now suppose the causal association between age and weight might be different for boys and girls. Use a single linear regression, with a categorical variable for sex, to estimate the total causal effect of age on weight separately for boys and girls. How do girls and boys differ? Provide one or more posterior contrasts as a summary.
library(ggdag)
dagify(
height ~ age,
weight ~ height,
weight ~ age,
height ~ sex,
weight ~ sex
) %>%
ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
geom_dag_point() +
geom_dag_edges() +
geom_dag_text() +
theme_dag()
Let’s take a look at the data:
data("Howell1")
df_kids <- Howell1 %>% filter(age < 13)
head(df_kids)
## height weight age male
## 1 121.92 19.61785 12.0 1
## 2 105.41 13.94795 8.0 0
## 3 86.36 10.48931 6.5 0
## 4 109.22 15.98912 7.0 0
## 5 114.30 17.86019 11.0 1
## 6 121.92 20.41164 8.0 1
We will modify our previous model:
prior <- alist(
weight ~ dnorm(mu, sigma),
mu <- a + b * age,
a ~ dunif(-5, 10),
b ~ dunif(0, 5),
sigma ~ dunif(0, 10)
)
model1 <- quap(prior, data = df_kids)
precis(model1)
## mean sd 5.5% 94.5%
## a 7.446058 0.36231834 6.867003 8.025113
## b 1.341747 0.05480708 1.254154 1.429339
## sigma 2.524138 0.14771394 2.288063 2.760214
Before we had a single a (intercept), and slope (b) for both sexes.
Thus a single value. Now we will have a vector like
c(0.5, 0.2) that will have a different effect on each
sex.
Take a look at our data. male is coded as 1 and 2. We
can recode it as sex (1 and 2). So a[1] and a[2] will refer to the first
and second index in our vector.
df_kids2 <- df_kids %>%
mutate(sex = male + 1)
head(df_kids2)
## height weight age male sex
## 1 121.92 19.61785 12.0 1 2
## 2 105.41 13.94795 8.0 0 1
## 3 86.36 10.48931 6.5 0 1
## 4 109.22 15.98912 7.0 0 1
## 5 114.30 17.86019 11.0 1 2
## 6 121.92 20.41164 8.0 1 2
And now we can modify our model:
prior <- alist(
weight ~ dnorm(mu, sigma),
mu <- a[sex] + b[sex] * age,
# I'll use Sergiu's priors
a[sex] ~ dunif(-5, 10), # intercepts
b[sex] ~ dunif(0, 5), # slopes
sigma ~ dunif(0, 10) # standard deviation
)
model1 <- quap(prior, data = df_kids2)
precis(model1, depth=2)
## mean sd 5.5% 94.5%
## a[1] 7.059033 0.47848761 6.294318 7.823749
## a[2] 7.886592 0.50670456 7.076780 8.696403
## b[1] 1.290131 0.07304516 1.173391 1.406871
## b[2] 1.388702 0.07592423 1.267360 1.510043
## sigma 2.423610 0.14183103 2.196937 2.650283
coefficients <- coef(model1)
intercepts <- coefficients[1:2]
slopes <- coefficients[3:4]
sex <- c(1,2)
df_coefs <- data.frame(intercepts, slopes, sex)
print(df_coefs)
## intercepts slopes sex
## a[1] 7.059033 1.290131 1
## a[2] 7.886592 1.388702 2
ggplot(df_kids2, aes(y=weight, x=age, color=as.factor(sex))) +
geom_point() +
geom_smooth(method = "lm", se=FALSE, lty=2) + ## Linear regression comparision
geom_abline(data=df_coefs, aes(intercept=intercepts, slope=slopes, color=as.factor(sex)))
## `geom_smooth()` using formula = 'y ~ x'
Here is the comparison with the OLS
model_lm <- lm(weight ~ age + as.factor(sex) + as.factor(sex) * age, data=df_kids2)
summary(model_lm)
##
## Call:
## lm(formula = weight ~ age + as.factor(sex) + as.factor(sex) *
## age, data = df_kids2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9592 -1.4223 0.0049 1.3415 7.9375
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.05904 0.48518 14.549 <2e-16 ***
## age 1.29013 0.07407 17.418 <2e-16 ***
## as.factor(sex)2 0.82756 0.70667 1.171 0.244
## age:as.factor(sex)2 0.09857 0.10683 0.923 0.358
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.458 on 142 degrees of freedom
## Multiple R-squared: 0.8194, Adjusted R-squared: 0.8156
## F-statistic: 214.8 on 3 and 142 DF, p-value: < 2.2e-16
age_seq <- 0:12
## simulate data
mu_male <- sim(model1, data=list(age=age_seq, sex=rep(2,13)))
mu_female <- sim(model1, data=list(age=age_seq, sex=rep(1,13)))
mu_contrast <- mu_male # just temporary. we will fill it later on.
for ( i in 1:13 ) {
mu_contrast[,i] <- mu_male[,i] - mu_female[,i]
}
## create an empty plot
plot( NULL , xlim=c(0,13) , ylim=c(-15,15) , xlab="age" ,
ylab="weight difference (boys-girls)" )
# for each confidence interval, draw a shade
for ( p in c(0.5,0.67,0.89,0.99) ) {
shade( apply(mu_contrast,2,PI,prob=p) , age_seq )
}
# put a line at 0
abline(h=0,lty=2,lwd=2)
The take home message of this plot is, looking at the intervals and concluding that it estimates to zero, just because it overlaps to zero doesn’t make sense.
df_sim_male <- data.frame(mu_male) %>%
gather(age, weight) %>%
mutate(sex="male") %>%
mutate(age=str_sub(age,2)) %>%
mutate(age=as.integer(age))
df_sim_female <- data.frame(mu_female) %>%
gather(age, weight) %>%
mutate(sex="female") %>%
mutate(age=str_sub(age,2)) %>%
mutate(age=as.integer(age))
df_sim <- bind_rows(df_sim_male, df_sim_female)
df_sim %>%
ggplot(aes(x=weight, fill=sex)) +
geom_density(alpha= 0.2) +
facet_wrap(.~age)