This problem set is due on February 21, 2022 at 11:59am.
From the Howell1 dataset, consider only the people younger than 13 years old. Estimate the causal association between age and weight. Assume that age influences weight through two paths. First, age influences height, and height influences weight. Second, age directly influences weight through age related changes in muscle growth and body proportions. All of this implies this causal model (DAG):
library(dagitty)
g <- dagitty('dag {
bb="0,0,1,1"
A [pos="0.251,0.481"]
H [pos="0.350,0.312"]
W [pos="0.452,0.484"]
A -> H
A -> W
H -> W
}
')
plot(g)
Use a linear regression to estimate the total (not just direct) causal effect of each year of growth on weight. Be sure to carefully consider the priors. Try using prior predictive simulation to assess what they imply
library(rethinking)
## Loading required package: parallel
## rethinking (Version 2.13.2)
##
## Attaching package: 'rethinking'
## The following object is masked from 'package:stats':
##
## rstudent
data("Howell1")
d <- Howell1[Howell1$age<13,]
mod1 <- quap ( alist (
weight ~ dnorm(m,s),
m <- a + b * age,
a ~ dlnorm(1,5),
b ~ dlnorm(0,1),
s ~ exp(1)
), data = d)
precis(mod1)
## mean sd 5.5% 94.5%
## a 7.442333 0.39079789 6.817762 8.066903
## b 1.341089 0.05909368 1.246646 1.435532
priors <- extract.prior(mod1)
post <- extract.samples(mod1)
plot(d$age , d$weight , lwd=3, col='blue')
for ( i in 1:5 ) abline( post$a[i] , post$b[i] , lwd=3 , col='green')
for ( i in 1:10 ) abline( priors$a[i] , priors$b[i] , lwd=3 , col='red' )
Now suppose the causal association between age and weight might be different for boys and girls. Use a single linear regression, with a categorical variable for sex, to estimate the total causal effect of age on weight separately for boys and girls. How do girls and boys differ? Provide one or more posterior contrasts as a summary.
#dat$sex = dat$male + 1
dat <- list(W=d$weight, A=d$age,S=d$male+1)
# mod2 <- quap ( alist(
# W ~ dnorm (m, s),
# m <- a + b1 * A + b2 * S + b3 * S * A,
# a ~ dlnorm(1,5),
# b1 ~ dlnorm(0,1),
# b2 ~ dnorm(0,1),
# b3 ~ dnorm(0,1),
# s ~ exp(1)
# ), data = dat2)
mod3 <- quap ( alist(
W ~ dnorm (m, sigma),
m <- a[S] + b[S]*A,
a[S] ~ dnorm(1,5),
b[S] ~ dlnorm(0,1),
sigma ~ exp(1)
), data = dat)
precis(mod3)
priors <- extract.prior(mod3)
post <- extract.samples(mod3)
#sq <- seq(from=0, to = 12, length.out = 13)
sq <- 0:12
muF <- link(mod3, data=list(A=sq, S=rep(1,13)))
muM <- link(mod3, data=list(A=sq, S=rep(2,13)))
plot(d$age, d$weight, lwd=3, col=ifelse(d$male==1,4,2), xlim=c(0,12),ylim=c(0,30))
shade(apply(muF,2,PI,0.99), sq, col=col.alpha(2,0.5))
lines( sq, apply(muF, 2, mean), lwd=3, col=2)
library(rethinking)
data(Howell1)
d <- Howell1
d <- d[ d$age < 13 , ]
dat <- list(W=d$weight,A=d$age,S=d$male+1)
m3 <- quap(
alist(
W ~ dnorm( mu , sigma ),
mu <- a[S] + b[S]*A,
a[S] ~ dnorm(5,1),
b[S] ~ dlnorm(0,1),
sigma ~ dexp(1)
), data=dat )
# blank(bty="n")
plot( d$age , d$weight , lwd=3, col=ifelse(d$male==1,4,2) , xlab="age (years)" , ylab="weight (kg)" )
Aseq <- 0:12
# girls
muF <- link(m3,data=list(A=Aseq,S=rep(1,13)))
shade( apply(muF,2,PI,0.99) , Aseq , col=col.alpha(2,0.5) )
lines( Aseq , apply(muF,2,mean) , lwd=3 , col=2 )
# boys
muM <- link(m3,data=list(A=Aseq,S=rep(2,13)))
shade( apply(muM,2,PI,0.99) , Aseq , col=col.alpha(4,0.5) )
lines( Aseq , apply(muM,2,mean) , lwd=3 , col=4 )
mu_line <- apply(mu,2,mean)
plot(dat$age , dat$weight , lwd=3, col=dat$male+4) #Males are the darker blue dots, females are the lighter blue dots
for ( i in 1:10 ) {
abline( priors$a[i], priors$b1[i] , lwd=3 , col= i )
abline( priors$a[i] + priors$b2, priors$b1[i] + priors$b3[i], lwd=3 , col= i )
} #gives two lines of the same color for every prior simulation, with one being male, the other female
for ( i in 1:5 ) lines(sq, mu_line, lwd=2, col='green')
n_sim = 100
muF_sim <- sim(m3,data=list(A=Aseq,S=rep(1,13)), n_sim)
muM_sim <- sim(m3,data=list(A=Aseq,S=rep(2,13)), n_sim)
mu_contrast <- muF_sim
for (i in 1:13) mu_contrast[,i] <- muM_sim[,i] - muF_sim[,i]
plot(NULL, xlim=c(0,13), ylim=c(-15,15), xlab="age", ylab="weight differnce (boy-girls)")
for (p in c(0.5, 0.67, 0.89, 0.99))
shade( apply(mu_contrast,2,PI,prob=p), Aseq)
abline(h=0,lty=2,lwd=2)
for (i in 1:13) points( rep(i-1,n_sim), mu_contrast[1:n_sim,i], col=ifelse(mu_contrast[1:n_sim,i]>0,4,2), lwd=3)
For this problem, we want to compare the difference between Frequentist and Bayesian linear regressions. We’re going to use the similar functions from section 4.5.
To begin, I have provided the same code to get the model m4.5 (i.e., run R code 4.65). Please run it and refresh yourself.
# R code 4.65 + 4.66 (precis)
library(rethinking)
data(Howell1)
d <- Howell1
d$weight_s <- ( d$weight - mean(d$weight) )/sd(d$weight)
d$weight_s2 <- d$weight_s^2
m4.5 <- quap(
alist(
height ~ dnorm( mu , sigma ) ,
mu <- a + b1*(weight_s) + b2 * weight_s2,
a ~ dnorm( 178 , 20 ) ,
b1 ~ dlnorm( 0 , 1 ) ,
b2 ~ dnorm( 0, 1 ) ,
sigma ~ dunif( 0 , 50 )
) , data=d )
precis( m4.5 )
## mean sd 5.5% 94.5%
## a 146.057414 0.3689755 145.467720 146.647108
## b1 21.733062 0.2888889 21.271361 22.194762
## b2 -7.803268 0.2741838 -8.241467 -7.365069
## sigma 5.774473 0.1764650 5.492447 6.056498
Now modify m4.5 model by relaxing our “positive relationship” (aka lognormal) assumption for the b1 variable by modifying it’s prior as dnorm( 0 , 1 ) and create a new model called m4.5b. Run precis(m4.5b).
m4.5b <- quap(
alist(
height ~ dnorm( mu , sigma ) ,
mu <- a + b1*(weight_s) + b2 * weight_s2,
a ~ dnorm( 178 , 20 ) ,
b1 ~ dnorm(1 , exp(1) ) ,
b2 ~ dnorm( 0, 1 ) ,
sigma ~ dunif( 0 , 50 )
) , data=d )
precis( m4.5b )
## mean sd 5.5% 94.5%
## a 146.162029 0.3678600 145.574117 146.749940
## b1 21.517291 0.2864453 21.059496 21.975086
## b2 -7.908414 0.2728626 -8.344502 -7.472327
## sigma 5.769437 0.1760331 5.488102 6.050772
Now, run a frequentist regression of m4.5b by using the lm function. I have provided this code.
# hint: you need to only remove the eval=FALSE so this code runs
fm <- lm(height ~ weight_s + weight_s2, data = d)
names(fm$coefficients) <- c('a','b1','b2') # rename coef for consistency
fm
##
## Call:
## lm(formula = height ~ weight_s + weight_s2, data = d)
##
## Coefficients:
## a b1 b2
## 146.660 21.415 -8.412
Now compare all three models by using the coeftab() and putting all three of the models as parameters. You can also run a plot() on the coeftab() function to run a plot of the effects.
# type in your code here
coeftab(m4.5,m4.5b,fm)
## m4.5 m4.5b fm
## a 146.06 146.16 146.66
## b1 21.73 21.52 21.41
## b2 -7.80 -7.91 -8.41
## sigma 5.77 5.77 NA
## nobs 544 544 544
plot(coeftab(m4.5,m4.5b,fm))
How different are the models?
m4.5 and fm provide nearly identical estimates for the equation of the line (not withstanding that one has a sigma). m4.5b gives a greater slope, with both a higher b1 and b2 values. This can most likely be explained by negative values for b1 by nature of being normal potentially skewing the data, even when negative b1 would be nonsensical. The sigmas of m4.5 and m4.5b do indicate comparable levels of precision, leaving an ambiguous validity to m4.5b.
For this problem, we’re going to reuse the same model (m4.5) from Question 3 and run prior predictive simulations to understand the role of different priors. For help, see 5.4-5.5 code in the book.
d <- Howell1
N=42
a = rnorm(N, 178 , 20 )
b1 = rlnorm(N, 0 , 1 )
b2 = rnorm(N, 0, 1 )
d$weight_s <- ( d$weight - mean(d$weight) )/sd(d$weight)
d$weight_s2 <- d$weight_s^2
plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,1)" )
xbar <- mean(d$weight)
for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)
Change the priors on the b2 coefficient to b2 ~ dnorm(0, 10) and rerun the prior predictive simulation.
b2 <- rnorm(N,0,10)
plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,10)" )
for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)
Now, change the priors on the beta coefficients to more “flat, very uninformative” priors, dnorm(0, 100) for b1 and b2. Rerun a similar prior predictive simulation.
b2 <- rnorm(N,0,100)
b1 <- rnorm(N,0,100)
plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,10)" )
for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)
Return to data(cherry_blossoms) and model the association between blossom date (day) and March temperature (temp). Note that there are many missing values in both variables. You may consider a linear model, a polynomial, or a spline on temperature. How well does temperature trend predict the blossom trend?