This chapter extended the basic multilevel strategy of partial pooling to slopes as well as intercepts. Accomplishing this meant modeling covariation in the statistical population of parameters. The LKJcorr prior was introduced as a convenient family of priors for correlation matrices. You saw how covariance models can be applied to causal inference, using instrumental variables and the front-door criterion. Gaussian processes represent a practical method of extending the varying effects strategy to continuous dimensions of similarity, such as spatial, network, phylogenetic, or any other abstract distance between entities in the data.
Place each answer inside the code chunk (grey box). The code chunks should contain a text response or a code that completes/answers the question or activity requested. Make sure to include plots if the question requests them.
Finally, upon completion, name your final output .html
file as: YourName_ANLY505-Year-Semester.html and publish
the assignment to your R Pubs account and submit the link to Canvas.
Each question is worth 5 points.
14-1. Repeat the café robot simulation from the beginning of the chapter. This time, set rho to zero, so that there is no correlation between intercepts and slopes. Plot the posterior distribution. How does the posterior distribution of the correlation reflect this change in the underlying simulation?
library(MASS)
library(ellipse)
# Simulate the population
a <- 3.5 # average morning wait time
b <- (-1) # average difference afternoon wait time
sigma_a <- 1 # std dev in intercepts
sigma_b <- 0.5 # std dev in slopes
rho <- 0 # correlation between intercepts and slopes
# Create relevant variables
Mu <- c(a, b)
cov_ab <- sigma_a * sigma_b * rho
Sigma <- matrix(c(sigma_a^2, cov_ab, cov_ab, sigma_b^2), ncol=2)
# Simulation
N_cafes <- 20
set.seed(6)
vary_effects <- mvrnorm(N_cafes, Mu, Sigma)
a_cafe <- vary_effects[ ,1]
b_cafe <- vary_effects[ ,2]
N_visits <- 10
afternoon <- rep(0:1, N_visits * N_cafes/2)
cafe_id <- rep(1:N_cafes, each=N_visits)
mu <- a_cafe[cafe_id] + b_cafe[cafe_id] * afternoon
sigma <- 0.5 # std dev within cafes
wait <- rnorm(N_visits * N_cafes, mu, sigma)
d <- data.frame(cafe=cafe_id, afternoon=afternoon, wait=wait)
# Model
set.seed(42)
m14.1 <- ulam(
alist(
wait ~ normal(mu, sigma),
mu <- a_cafe[cafe] + b_cafe[cafe] * afternoon,
c(a_cafe,b_cafe)[cafe] ~ multi_normal(c(a,b), Rho, sigma_cafe),
a ~ normal(5, 2),
b ~ normal(-1, 0.5),
sigma_cafe ~ exponential(1),
sigma ~ exponential(1),
Rho ~ lkj_corr(2)
), data=d, chains=4, cores=4, refresh=0
)
precis(m14.1, pars="Rho[1, 2]")
## result
## mean 0.01875514
## sd 0.23532935
## 5.5% -0.36305534
## 94.5% 0.39079610
## n_eff 1900.18424871
## Rhat 0.99876875
# Plot posterior distribution
posterior <- extract.samples(m14.1)
dens(posterior$Rho[ , 1, 2])
legend("topright",
pch=15,
legend=c("Rho"))
# The posterior distribution of the correlation reflects this change because now the posterior for rho centers around 0. This, of course, is by design since we set rho to be equal to 0.
14-2. Fit this multilevel model to the simulated café data: \[\begin{align} \ W_i ∼ Normal(μ_i, σ) \\ \ μ_i = α_{CAFE[i]} + β_{CAFE[i]}A_i \\ \ α_{CAFE} ∼ Normal(α, σ_α) \\ \ β_{CAFE} ∼ Normal(β, σ_β) \\ \ α ∼ Normal(0, 10) \\ \ β ∼ Normal(0, 10) \\ \ σ, σ_α, σ_β ∼ Exponential(1) \\ \end{align}\] Use WAIC to compare this model to the model from the chapter, the one that uses a multi-variate Gaussian prior. Visualize the differences between the posterior means. Explain the result.
# Simulate the population
a <- 3.5 # average morning wait time
b <- (-1) # average difference afternoon wait time
sigma_a <- 1 # std dev in intercepts
sigma_b <- 0.5 # std dev in slopes
rho <- -0.7 # correlation between intercepts and slopes
# Create relevant variables
Mu <- c(a, b)
cov_ab <- sigma_a * sigma_b * rho
Sigma <- matrix(c(sigma_a^2, cov_ab, cov_ab, sigma_b^2), ncol=2)
# Simulation
N_cafes <- 20
set.seed(42)
vary_effects <- mvrnorm(N_cafes, Mu, Sigma)
a_cafe <- vary_effects[ ,1]
b_cafe <- vary_effects[ ,2]
N_visits <- 10
afternoon <- rep(0:1, N_visits * N_cafes/2)
cafe_id <- rep(1:N_cafes, each=N_visits)
mu <- a_cafe[cafe_id] + b_cafe[cafe_id] * afternoon
sigma <- 0.5 # std dev within cafes
wait <- rnorm(N_visits * N_cafes, mu, sigma)
d <- data.frame(cafe=cafe_id, afternoon=afternoon, wait=wait)
# Base model from chapter
set.seed(42)
m14.2_base <- ulam(
alist(
wait ~ normal(mu, sigma),
mu <- a_cafe[cafe] + b_cafe[cafe] * afternoon,
c(a_cafe,b_cafe)[cafe] ~ multi_normal(c(a,b), Rho, sigma_cafe),
a ~ normal(5, 2),
b ~ normal(-1, 0.5),
sigma_cafe ~ exponential(1),
sigma ~ exponential(1),
Rho ~ lkj_corr(2)
), data=d, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
precis(m14.2_base)
## mean sd 5.5% 94.5% n_eff Rhat4
## a 3.3361545 0.32022138 2.8323511 3.8531714 4219.538 0.9996885
## b -0.7640929 0.11785851 -0.9527759 -0.5761716 3445.447 1.0002171
## sigma 0.4640983 0.02562056 0.4252650 0.5076492 3362.908 1.0001821
# Model without correlation parameter
m14.2 <- ulam(
alist(
wait ~ dnorm(mu, sigma),
mu <- a_cafe[cafe] + b_cafe[cafe] * afternoon,
a_cafe[cafe] ~ dnorm(a, sigma_alpha),
b_cafe[cafe] ~ dnorm(b, sigma_beta),
a ~ dnorm(0, 10),
b ~ dnorm(0, 10),
sigma ~ dexp(1),
sigma_alpha ~ dexp(1),
sigma_beta ~ dexp(1)
),
data=d, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
# Extract posterior results
# Intercepts
a_base_mod <- apply(extract.samples(m14.2_base)$a_cafe, 2, mean)
a_mod <- apply(extract.samples(m14.2)$a_cafe, 2, mean)
# Slopes
b_base_mod <- apply(extract.samples(m14.2_base)$b_cafe, 2, mean)
b_mod <- apply(extract.samples(m14.2)$b_cafe, 2, mean)
# Compare results
compare(m14.2_base, m14.2)
## WAIC SE dWAIC dSE pWAIC weight
## m14.2_base 295.2996 21.14359 0.000000 NA 31.64082 0.6931406
## m14.2 296.9293 20.77566 1.629686 2.006231 32.10835 0.3068594
# Plot difference between the posterior means
plot(a_mod, b_mod,
xlab="Intercept", ylab="Slope",
pch=15, col=rangi2, ylim=c(-1.5, -0.1))
points(a_base_mod, b_base_mod, pch=1)
# Based on the WAIC we see that the value between the base model (correlation-informed) and the no correlation model is similar. This is corroborated by the plot of the posterior means. We see that the correlation-informed model is not too far off from the no correlation model. That said, the correlation-informed model should be more accurate since it takes into account information that is missing from the no correlation model.
14-3. Use WAIC to compare the Gaussian process model of Oceanic tools to the models fit to the same data in Chapter 11. Pay special attention to the effective numbers of parameters, as estimated by WAIC. Explain the result.
# Data
data(Kline)
d <- Kline
# From chapter 11
d$P <- scale(log(d$population))
d$contact_id <- ifelse(d$contact == "high", 2, 1)
dat_11 <- list(T=d$total_tools, P=d$population, cid=d$contact_id)
# Gaussian process model
d$society <- 1:10
data("islandsDistMatrix")
dat_gauss <- list(T=d$total_tools, P=d$population, society=d$society, Dmat=islandsDistMatrix)
# Model
m11.11 <- ulam(
alist(
T ~ dpois(lambda),
lambda <- exp(a[cid]) * P^b[cid] / g,
a[cid] ~ dnorm(1, 1),
b[cid] ~ dexp(1),
g ~ dexp(1)
),
data=dat_11, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
m14.3 <- ulam(
alist(
T ~ dpois(lambda),
lambda <- (a * P^b / g) * exp(k[society]),
vector[10]:k ~ multi_normal(0, SIGMA),
matrix[10, 10]:SIGMA <- cov_GPL2(Dmat, etasq, rhosq, 0.01),
c(a, b, g) ~ dexp(1),
etasq ~ dexp(2),
rhosq ~ dexp(0.5)
),
data=dat_gauss, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
# Compare results
compare(m11.11, m14.3)
## WAIC SE dWAIC dSE pWAIC weight
## m14.3 67.75565 2.323249 0.00000 NA 4.161317 0.997875113
## m11.11 80.05946 11.075104 12.30382 11.16785 4.808338 0.002124887
# The model with the Gaussian process prior has a lower WAIC. Additionally we see that the Gaussian process model has less effective parameters. This could be an indication of regularization on the model with the Gaussian process prior.
14-4. Let’s revisit the Bangladesh fertility data, data(bangladesh), from the practice problems for Chapter 13. Fit a model with both varying intercepts by district_id and varying slopes of urban by district_id. You are still predicting use.contraception. Inspect the correlation between the intercepts and slopes. Can you interpret this correlation, in terms of what it tells you about the pattern of contraceptive use in the sample? Plot the mean (or median) varying effect estimates for both the intercepts and slopes, by district. Then you can visualize the correlation and maybe more easily think through what it means to have a particular correlation. Plot the predicted proportion of women using contraception, with urban women on one axis and rural on the other. Explain the results.
# Data
data(bangladesh)
d <- bangladesh
dat_list <- list(
C = d$use.contraception,
did=as.integer(as.factor(d$district)),
urban=d$urban
)
# Model
m14.4 <- ulam(
alist(
C ~ bernoulli(p),
logit(p) <- a[did] + b[did] * urban,
c(a, b)[did] ~ multi_normal(c(a_bar, b_bar), Rho, Sigma),
a_bar ~ normal(0, 1),
b_bar ~ normal(0, 0.5),
Rho ~ lkj_corr(2),
Sigma ~ exponential(1)
),
dat=dat_list, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
precis(m14.4)
## mean sd 5.5% 94.5% n_eff Rhat4
## a_bar -0.6878679 0.1009263 -0.8463173 -0.5270973 2462.498 1.0001702
## b_bar 0.6411844 0.1601882 0.3895893 0.9048042 2355.963 0.9997593
# b_bar has positive value, which means that contraception is used more amongst the urban population
precis(m14.4, depth=2, pars=c("Rho", "Sigma"))
## mean sd 5.5% 94.5% n_eff Rhat4
## Sigma[1] 0.5788722 0.09928925 0.4297144 0.7441117 816.1521 1.005156
## Sigma[2] 0.7743619 0.21589201 0.4334737 1.1082418 276.7083 1.011742
# Negative correlative between intercepts and slopes
# Plot
posterior <- extract.samples(m14.4)
a <- apply(posterior$a, 2, mean)
b <- apply(posterior$b, 2, mean)
R <- apply(posterior$Rho, 2:3, mean)
s <- apply(posterior$Sigma, 2, mean)
S <-diag(s) %*% R %*% diag(s)
plot(a, b, col=rangi2, xlab="a (Intercept)", ylab="b (Slope - Urban)")
for (l in c(0.5, 0.67, 0.89, 0.97)) {
lines(ellipse(S, centre= c(mean(posterior$a_bar), mean(posterior$b_bar)),level=l), col=col.alpha("black",0.2))
}
# Areas with high usage of contraception which are not urban have a smaller slope, and vice versa. This means that areas with high contraceptive usage does not differ much between urban and non-urban areas.
14-5 Now consider the predictor variables age.centered and living.children, also contained in data(bangladesh). Suppose that age influences contraceptive use (changing attitudes) and number of children (older people have had more time to have kids). Number of children may also directly influence contraceptive use. Draw a DAG that reflects these hypothetical relationships. Then build models needed to evaluate the DAG. You will need at least two models. Retain district and urban, as in 14-4. What do you conclude about the causal influence of age and children?
# Draw DAG
dag_14.5 <- dagitty("dag{
Contraception <- Age
Number_of_Children <- Age
Contraception <- Number_of_Children
}"
)
plot(dag_14.5)
# Data
data(bangladesh)
d <- bangladesh
dat_list <- list(
C = d$use.contraception,
did=as.integer(as.factor(d$district)),
urban=d$urban
)
dat_list$children <- standardize(d$living.children)
dat_list$age <- standardize(d$age.centered)
# Model with age
m14.5_age <- ulam(
alist(
C ~ bernoulli(p),
logit(p) <- a[did] + b[did] * urban + bA * age,
c(a, b)[did] ~ multi_normal(c(a_bar, b_bar), Rho, Sigma),
a_bar ~ normal(0, 1),
c(b_bar, bA) ~ normal(0, 0.5),
Rho ~ lkj_corr(2),
Sigma ~ exponential(1)
),
dat=dat_list, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
precis(m14.5_age)
## mean sd 5.5% 94.5% n_eff Rhat4
## a_bar -0.68956087 0.09781286 -0.845336355 -0.5341903 3330.752 0.9998721
## bA 0.08400588 0.04991807 0.003319606 0.1654923 6637.447 0.9996921
## b_bar 0.64035589 0.16025871 0.381645435 0.8939028 2036.378 1.0012235
precis(m14.5_age, depth=2, pars=c("Rho", "Sigma"))
## mean sd 5.5% 94.5% n_eff Rhat4
## Sigma[1] 0.5772453 0.09556097 0.4342694 0.7407226 1084.7371 1.000375
## Sigma[2] 0.7829233 0.20691397 0.4556329 1.1160162 331.7157 1.011440
# From here we see that age has a positive effect and is small. Contraception usage is higher as age increase.
# Model with age and number of children
m14.5_age_chil <- ulam(
alist(
C ~ bernoulli(p),
logit(p) <- a[did] + b[did] * urban + bA * age + bN * children,
c(a, b)[did] ~ multi_normal(c(a_bar, b_bar), Rho, Sigma),
a_bar ~ normal(0, 1),
c(b_bar, bA, bN) ~ normal(0, 0.5),
Rho ~ lkj_corr(2),
Sigma ~ exponential(1)
),
dat=dat_list, chains=4, cores=4, iter=2000, log_lik=TRUE, refresh=0
)
precis(m14.5_age_chil)
## mean sd 5.5% 94.5% n_eff Rhat4
## a_bar -0.7160364 0.10209092 -0.8803664 -0.5573534 2945.039 1.0001517
## bN 0.5123331 0.07150175 0.3987077 0.6279361 3886.804 0.9998692
## bA -0.2630735 0.07082144 -0.3758186 -0.1516876 4582.690 0.9994240
## b_bar 0.6896799 0.16067994 0.4328458 0.9445034 1888.957 1.0040296
precis(m14.5_age_chil, depth=2, pars=c("Rho", "Sigma"))
## mean sd 5.5% 94.5% n_eff Rhat4
## Sigma[1] 0.6085725 0.1027031 0.4566243 0.7768011 1169.5920 1.002468
## Sigma[2] 0.7775375 0.2058840 0.4665425 1.1216542 383.3598 1.002112
# Compare results
compare(m14.5_age, m14.5_age_chil)
## WAIC SE dWAIC dSE pWAIC weight
## m14.5_age_chil 2413.032 30.36106 0.00000 NA 55.00186 1.00000e+00
## m14.5_age 2467.014 27.94485 53.98224 14.16972 52.78222 1.89629e-12
# Number of children has an impact on contraception usage as age increases. Additionally, as usage of contraception decrease, the number of children increases, which make sense. Overall, WAIC is lower for the model that includes both age and number of children.