Problem Set 3

This problem set is due on February 21, 2022 at 11:59am.

Name:
UNCC ID:
Other student worked with (optional):

Question 1

From the Howell1 dataset, consider only the people younger than 13 years old. Estimate the causal association between age and weight. Assume that age influences weight through two paths. First, age influences height, and height influences weight. Second, age directly influences weight through age related changes in muscle growth and body proportions. All of this implies this causal model (DAG):

library(dagitty)

g <- dagitty('dag {
bb="0,0,1,1"
A [pos="0.251,0.481"]
H [pos="0.350,0.312"]
W [pos="0.452,0.484"]
A -> H
A -> W
H -> W
}
')
plot(g)

Use a linear regression to estimate the total (not just direct) causal effect of each year of growth on weight. Be sure to carefully consider the priors. Try using prior predictive simulation to assess what they imply

library(rethinking)

## Loading required package: parallel

## rethinking (Version 2.13.2)

## 
## Attaching package: 'rethinking'

## The following object is masked from 'package:stats':
## 
##     rstudent

data("Howell1")
d <- Howell1[Howell1$age<13,]

mod1 <- quap ( alist (
  weight ~ dnorm(m,s),
  m <- a + b * age,
  a ~ dlnorm(1,5),
  b ~ dlnorm(0,1),
  s ~ exp(1)
), data = d)
precis(mod1)

##       mean         sd     5.5%    94.5%
## a 7.442333 0.39079789 6.817762 8.066903
## b 1.341089 0.05909368 1.246646 1.435532

priors <- extract.prior(mod1)
post <- extract.samples(mod1)

plot(d$age , d$weight , lwd=3, col='blue')
for ( i in 1:5 ) abline( post$a[i] , post$b[i] , lwd=3 , col='green')

for ( i in 1:10 ) abline( priors$a[i] , priors$b[i] , lwd=3 , col='red' )

Question 2

Now suppose the causal association between age and weight might be different for boys and girls. Use a single linear regression, with a categorical variable for sex, to estimate the total causal effect of age on weight separately for boys and girls. How do girls and boys differ? Provide one or more posterior contrasts as a summary.

#dat$sex = dat$male + 1

dat <- list(W=d$weight, A=d$age,S=d$male+1)

# mod2 <- quap ( alist(
#   W ~ dnorm (m, s),
#   m <- a + b1 * A + b2 * S + b3 * S * A,
#   a ~ dlnorm(1,5),
#   b1 ~ dlnorm(0,1),
#   b2 ~ dnorm(0,1),
#   b3 ~ dnorm(0,1),
#   s ~ exp(1)
# ), data = dat2)

mod3 <- quap ( alist(
  W ~ dnorm (m, sigma),
  m <- a[S] + b[S]*A,
  a[S] ~ dnorm(1,5),
  b[S] ~ dlnorm(0,1),
  sigma ~ exp(1)
), data = dat)

precis(mod3)
priors <- extract.prior(mod3)
post <- extract.samples(mod3)
#sq <- seq(from=0, to = 12, length.out = 13)
sq <- 0:12
muF <- link(mod3, data=list(A=sq, S=rep(1,13)))
muM <- link(mod3, data=list(A=sq, S=rep(2,13)))

plot(d$age, d$weight, lwd=3, col=ifelse(d$male==1,4,2), xlim=c(0,12),ylim=c(0,30))
shade(apply(muF,2,PI,0.99), sq, col=col.alpha(2,0.5))
lines( sq, apply(muF, 2, mean), lwd=3, col=2)

library(rethinking)
data(Howell1)
d <- Howell1
d <- d[ d$age < 13 , ]

dat <- list(W=d$weight,A=d$age,S=d$male+1)

m3 <- quap(
    alist(
        W ~ dnorm( mu , sigma ),
        mu <- a[S] + b[S]*A,
        a[S] ~ dnorm(5,1),
        b[S] ~ dlnorm(0,1),
        sigma ~ dexp(1)
    ), data=dat )

# blank(bty="n")
plot( d$age , d$weight , lwd=3, col=ifelse(d$male==1,4,2) , xlab="age (years)" , ylab="weight (kg)" )
Aseq <- 0:12

# girls
muF <- link(m3,data=list(A=Aseq,S=rep(1,13)))
shade( apply(muF,2,PI,0.99) , Aseq , col=col.alpha(2,0.5) )
lines( Aseq , apply(muF,2,mean) , lwd=3 , col=2 )

# boys
muM <- link(m3,data=list(A=Aseq,S=rep(2,13)))
shade( apply(muM,2,PI,0.99) , Aseq , col=col.alpha(4,0.5) )
lines( Aseq , apply(muM,2,mean) , lwd=3 , col=4 )

mu_line <- apply(mu,2,mean)

plot(dat$age , dat$weight , lwd=3, col=dat$male+4) #Males are the darker blue dots, females are the lighter blue dots

for ( i in 1:10 ) {
  abline( priors$a[i], priors$b1[i] , lwd=3 , col= i )
  abline( priors$a[i] + priors$b2, priors$b1[i] + priors$b3[i], lwd=3 , col= i ) 
} #gives two lines of the same color for every prior simulation, with one being male, the other female

for ( i in 1:5 ) lines(sq, mu_line, lwd=2, col='green')

n_sim = 100

muF_sim <- sim(m3,data=list(A=Aseq,S=rep(1,13)), n_sim)
muM_sim <- sim(m3,data=list(A=Aseq,S=rep(2,13)), n_sim)

mu_contrast <- muF_sim
for (i in 1:13) mu_contrast[,i] <- muM_sim[,i] - muF_sim[,i]

plot(NULL, xlim=c(0,13), ylim=c(-15,15), xlab="age", ylab="weight differnce (boy-girls)")

for (p in c(0.5, 0.67, 0.89, 0.99))
  shade( apply(mu_contrast,2,PI,prob=p), Aseq)

abline(h=0,lty=2,lwd=2)

for (i in 1:13) points( rep(i-1,n_sim), mu_contrast[1:n_sim,i], col=ifelse(mu_contrast[1:n_sim,i]>0,4,2), lwd=3)

Question 3

For this problem, we want to compare the difference between Frequentist and Bayesian linear regressions. We’re going to use the similar functions from section 4.5.

To begin, I have provided the same code to get the model m4.5 (i.e., run R code 4.65). Please run it and refresh yourself.

# R code 4.65 + 4.66 (precis)
library(rethinking)
data(Howell1)
d <- Howell1
d$weight_s <- ( d$weight - mean(d$weight) )/sd(d$weight)
d$weight_s2 <- d$weight_s^2

m4.5 <- quap(
  alist(
    height ~ dnorm( mu , sigma ) ,
    mu <- a + b1*(weight_s) + b2 * weight_s2,
    a ~ dnorm( 178 , 20 ) ,
    b1 ~ dlnorm( 0 , 1 ) ,
    b2 ~ dnorm( 0, 1 ) ,
    sigma ~ dunif( 0 , 50 )
) , data=d )

precis( m4.5 )

##             mean        sd       5.5%      94.5%
## a     146.057414 0.3689755 145.467720 146.647108
## b1     21.733062 0.2888889  21.271361  22.194762
## b2     -7.803268 0.2741838  -8.241467  -7.365069
## sigma   5.774473 0.1764650   5.492447   6.056498

Now modify m4.5 model by relaxing our “positive relationship” (aka lognormal) assumption for the b1 variable by modifying it’s prior as dnorm( 0 , 1 ) and create a new model called m4.5b. Run precis(m4.5b).

m4.5b <- quap(
  alist(
    height ~ dnorm( mu , sigma ) ,
    mu <- a + b1*(weight_s) + b2 * weight_s2,
    a ~ dnorm( 178 , 20 ) ,
    b1 ~ dnorm(1 , exp(1) ) ,
    b2 ~ dnorm( 0, 1 ) ,
    sigma ~ dunif( 0 , 50 )
) , data=d )

precis( m4.5b )

##             mean        sd       5.5%      94.5%
## a     146.162029 0.3678600 145.574117 146.749940
## b1     21.517291 0.2864453  21.059496  21.975086
## b2     -7.908414 0.2728626  -8.344502  -7.472327
## sigma   5.769437 0.1760331   5.488102   6.050772

Now, run a frequentist regression of m4.5b by using the lm function. I have provided this code.

# hint: you need to only remove the eval=FALSE so this code runs
fm <- lm(height ~ weight_s + weight_s2, data = d)
names(fm$coefficients) <- c('a','b1','b2') # rename coef for consistency
fm

## 
## Call:
## lm(formula = height ~ weight_s + weight_s2, data = d)
## 
## Coefficients:
##       a       b1       b2  
## 146.660   21.415   -8.412

Now compare all three models by using the coeftab() and putting all three of the models as parameters. You can also run a plot() on the coeftab() function to run a plot of the effects.

# type in your code here
coeftab(m4.5,m4.5b,fm)

##       m4.5    m4.5b   fm     
## a      146.06  146.16  146.66
## b1      21.73   21.52   21.41
## b2      -7.80   -7.91   -8.41
## sigma    5.77    5.77      NA
## nobs      544     544     544

plot(coeftab(m4.5,m4.5b,fm))

How different are the models?

m4.5 and fm provide nearly identical estimates for the equation of the line (not withstanding that one has a sigma). m4.5b gives a greater slope, with both a higher b1 and b2 values. This can most likely be explained by negative values for b1 by nature of being normal potentially skewing the data, even when negative b1 would be nonsensical. The sigmas of m4.5 and m4.5b do indicate comparable levels of precision, leaving an ambiguous validity to m4.5b.

Question 4

For this problem, we’re going to reuse the same model (m4.5) from Question 3 and run prior predictive simulations to understand the role of different priors. For help, see 5.4-5.5 code in the book.

d <- Howell1

N=42
a = rnorm(N, 178 , 20 )
b1 = rlnorm(N, 0 , 1 ) 
b2 = rnorm(N, 0, 1 ) 

d$weight_s <- ( d$weight - mean(d$weight) )/sd(d$weight)
d$weight_s2 <- d$weight_s^2

plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,1)" )

xbar <- mean(d$weight)
for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)

Change the priors on the b2 coefficient to b2 ~ dnorm(0, 10) and rerun the prior predictive simulation.

b2 <-  rnorm(N,0,10)
  
plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,10)" )

for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)

Now, change the priors on the beta coefficients to more “flat, very uninformative” priors, dnorm(0, 100) for b1 and b2. Rerun a similar prior predictive simulation.

b2 <-  rnorm(N,0,100)
b1 <-  rnorm(N,0,100)
  
plot( NULL , xlim=range(d$weight) , ylim=c(-100,400) ,
xlab="weight" , ylab="height" )
mtext( "b ~ dnorm(0,10)" )

for ( i in 1:N ) curve( a[i] + b1[i]*(x - xbar)*d$weight_s[i] + b2[i]*(x-xbar)*d$weight_s2[i],
from=min(d$weight) , to=max(d$weight) , add=TRUE ,
col=i)

Optional Challenge (Not graded)

Return to data(cherry_blossoms) and model the association between blossom date (day) and March temperature (temp). Note that there are many missing values in both variables. You may consider a linear model, a polynomial, or a spline on temperature. How well does temperature trend predict the blossom trend?