What have we done:

Question for the week

Q3- Now suppose the causal association between age and weight might be different for boys and girls. Use a single linear regression, with a categorical variable for sex, to estimate the total causal effect of age on weight separately for boys and girls. How do girls and boys differ? Provide one or more posterior contrasts as a summary.

library(ggdag)
dagify(
  height ~ age,
  weight ~ height,
  weight ~ age,
  height ~ sex,
  weight ~ sex
  ) %>%
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_dag_point() + 
  geom_dag_edges() +
  geom_dag_text() +
  theme_dag()

Solution

Let’s take a look at the data:

data("Howell1")
df_kids <- Howell1 %>% filter(age < 13)
head(df_kids)
##   height   weight  age male
## 1 121.92 19.61785 12.0    1
## 2 105.41 13.94795  8.0    0
## 3  86.36 10.48931  6.5    0
## 4 109.22 15.98912  7.0    0
## 5 114.30 17.86019 11.0    1
## 6 121.92 20.41164  8.0    1

We will modify our previous model:

prior <-  alist(
  weight ~ dnorm(mu, sigma),
  mu <- a + b * age,
  a ~ dunif(-5, 10),
  b ~ dunif(0, 5),
  sigma ~ dunif(0, 10)
)
model1 <- quap(prior, data = df_kids)
precis(model1)
##           mean         sd     5.5%    94.5%
## a     7.446058 0.36231834 6.867003 8.025113
## b     1.341747 0.05480708 1.254154 1.429339
## sigma 2.524138 0.14771394 2.288063 2.760214

Before we had a single a (intercept), and slope (b) for both sexes. Thus a single value. Now we will have a vector like c(0.5, 0.2) that will have a different effect on each sex.

Take a look at our data. male is coded as 1 and 2. We can recode it as sex (1 and 2). So a[1] and a[2] will refer to the first and second index in our vector.

df_kids2 <- df_kids %>% 
  mutate(sex = male + 1) 

head(df_kids2)
##   height   weight  age male sex
## 1 121.92 19.61785 12.0    1   2
## 2 105.41 13.94795  8.0    0   1
## 3  86.36 10.48931  6.5    0   1
## 4 109.22 15.98912  7.0    0   1
## 5 114.30 17.86019 11.0    1   2
## 6 121.92 20.41164  8.0    1   2

And now we can modify our model:

prior <-  alist(
  weight ~ dnorm(mu, sigma),
  mu <- a[sex] + b[sex] * age,
  # I'll use Sergiu's priors
  a[sex] ~ dunif(-5, 10), # intercepts
  b[sex] ~ dunif(0, 5),  # slopes
  sigma ~ dunif(0, 10) # standard deviation
)

model1 <- quap(prior, data = df_kids2)
precis(model1, depth=2)
##           mean         sd     5.5%    94.5%
## a[1]  7.059033 0.47848761 6.294318 7.823749
## a[2]  7.886592 0.50670456 7.076780 8.696403
## b[1]  1.290131 0.07304516 1.173391 1.406871
## b[2]  1.388702 0.07592423 1.267360 1.510043
## sigma 2.423610 0.14183103 2.196937 2.650283
coefficients <- coef(model1)

intercepts <- coefficients[1:2]
slopes <- coefficients[3:4]
sex <- c(1,2)

df_coefs <- data.frame(intercepts, slopes, sex)
print(df_coefs)
##      intercepts   slopes sex
## a[1]   7.059033 1.290131   1
## a[2]   7.886592 1.388702   2
ggplot(df_kids2, aes(y=weight, x=age, color=as.factor(sex))) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE, lty=2) + ## Linear regression comparision
  geom_abline(data=df_coefs, aes(intercept=intercepts, slope=slopes, color=as.factor(sex)))
## `geom_smooth()` using formula = 'y ~ x'

Comparison with the linear model

Here is the comparison with the OLS

model_lm <- lm(weight ~ age + as.factor(sex) + as.factor(sex) * age, data=df_kids2)
summary(model_lm)
## 
## Call:
## lm(formula = weight ~ age + as.factor(sex) + as.factor(sex) * 
##     age, data = df_kids2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9592 -1.4223  0.0049  1.3415  7.9375 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.05904    0.48518  14.549   <2e-16 ***
## age                  1.29013    0.07407  17.418   <2e-16 ***
## as.factor(sex)2      0.82756    0.70667   1.171    0.244    
## age:as.factor(sex)2  0.09857    0.10683   0.923    0.358    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.458 on 142 degrees of freedom
## Multiple R-squared:  0.8194, Adjusted R-squared:  0.8156 
## F-statistic: 214.8 on 3 and 142 DF,  p-value: < 2.2e-16

Lets contrast it with age

age_seq <- 0:12

## simulate data
mu_male <- sim(model1, data=list(age=age_seq, sex=rep(2,13)))
mu_female <- sim(model1, data=list(age=age_seq, sex=rep(1,13)))


mu_contrast <- mu_male # just temporary. we will fill it later on.

for ( i in 1:13 ) {
  mu_contrast[,i] <- mu_male[,i] - mu_female[,i]
}

## create an empty plot
plot( NULL , xlim=c(0,13) , ylim=c(-15,15) , xlab="age" ,
ylab="weight difference (boys-girls)" )

# for each confidence interval, draw a shade
for ( p in c(0.5,0.67,0.89,0.99) ) {
shade( apply(mu_contrast,2,PI,prob=p) , age_seq )
}
# put a line at 0
abline(h=0,lty=2,lwd=2)

The take home message of this plot is, looking at the intervals and concluding that it estimates to zero, just because it overlaps to zero doesn’t make sense.

df_sim_male <- data.frame(mu_male) %>% 
  gather(age, weight) %>% 
  mutate(sex="male") %>% 
  mutate(age=str_sub(age,2)) %>% 
  mutate(age=as.integer(age)) 

df_sim_female <- data.frame(mu_female)  %>% 
  gather(age, weight) %>% 
  mutate(sex="female") %>% 
  mutate(age=str_sub(age,2)) %>% 
  mutate(age=as.integer(age)) 

df_sim <- bind_rows(df_sim_male, df_sim_female)  
  
df_sim %>% 
  ggplot(aes(x=weight, fill=sex)) +
  geom_density(alpha= 0.2) +
  facet_wrap(.~age)