Anly505_Assignment7

Questions

7-1. When comparing models with an information criterion, why must all models be fit to exactly the same observations? What would happen to the information criterion values if the models were fit to different numbers of observations? Perform some simulations.

library(ggplot2)
library(tidyverse)
library(rethinking)
# Load the Howell1 data, keep only complete rows, and draw three random
# subsamples (100, 200 and 400 rows, each without replacement) so the same
# model can be fit to different numbers of observations.
data("Howell1")
d <- Howell1[complete.cases(Howell1), ]
row_ids <- seq_len(nrow(d))
d1 <- d[sample(row_ids, size = 100), ]
d2 <- d[sample(row_ids, size = 200), ]
d3 <- d[sample(row_ids, size = 400), ]
# Model 1: Gaussian regression of height on log(weight), fit to the 100-row
# subsample d1. Starting values come from d1 itself.
m1_formula <- alist(
  height ~ dnorm(mu, sigma),
  mu <- a + b * log(weight)
)
m1_start <- list(a = mean(d1$height), b = 0, sigma = sd(d1$height))
m1 <- map(m1_formula, data = d1, start = m1_start)
# Model 2: Gaussian regression of height on log(weight), fit to the 200-row
# subsample d2. Every starting value is derived from d2, the data this model
# is actually fit to (sigma must not be seeded from a different subsample).
m2 <- map(
  alist(
    height ~ dnorm(mu, sigma),
    mu <- a + b * log(weight)
  ),
  data = d2,
  start = list(a = mean(d2$height), b = 0, sigma = sd(d2$height))
)
# Model 3: Gaussian regression of height on log(weight), fit to the 400-row
# subsample d3. Starting values come from d3 itself.
m3_formula <- alist(
  height ~ dnorm(mu, sigma),
  mu <- a + b * log(weight)
)
m3_start <- list(a = mean(d3$height), b = 0, sigma = sd(d3$height))
m3 <- map(m3_formula, data = d3, start = m3_start)
# Compare the three fits. They use different numbers of observations, so the
# WAIC values mainly reflect sample size rather than model quality.
model.compare <- compare(m1, m2, m3)
model.compare

##         WAIC       SE    dWAIC      dSE    pWAIC        weight
## m1  577.8634 13.37284    0.000       NA 2.849169  1.000000e+00
## m2 1178.1954 19.40190  600.332 19.71694 2.870596 4.360811e-131
## m3 2432.7922 34.40122 1854.929 22.14983 3.486548  0.000000e+00

7-2. What happens to the effective number of parameters, as measured by PSIS or WAIC, as a prior becomes more concentrated? Why? Perform some simulations.

# Rebuild the complete-case data and add z-scored log(height) and log(weight)
# columns (centered on the mean, scaled by the standard deviation).
d <- Howell1[complete.cases(Howell1), ]
log_height <- log(d$height)
log_weight <- log(d$weight)
d$heightStd <- (log_height - mean(log_height)) / sd(log_height)
d$weightStd <- (log_weight - mean(log_weight)) / sd(log_weight)


# Model 4: standardized log-height on standardized log-weight with wide
# priors (normal sd = 10 on a and b, uniform(0, 10) on sigma).
m4 <- alist(
  heightStd ~ dnorm(mu, sigma),
  mu <- a + b * weightStd,
  a ~ dnorm(0, 10),
  b ~ dnorm(1, 10),
  sigma ~ dunif(0, 10)
) %>%
  map(data = d)
# Model 5: same likelihood as m4 but with more concentrated priors
# (normal sd = 1 on a and b, uniform(0, 1) on sigma).
m5 <- alist(
  heightStd ~ dnorm(mu, sigma),
  mu <- a + b * weightStd,
  a ~ dnorm(0, 1),
  b ~ dnorm(1, 1),
  sigma ~ dunif(0, 1)
) %>%
  map(data = d)
# WAIC for m4, the model with the wider priors; the `penalty` column of the
# result is the effective number of parameters.
# NOTE(review): refresh = 0 presumably silences progress output -- confirm.
WAIC(m4, refresh = 0)

##        WAIC     lppd  penalty std_err
## 1 -102.6993 55.60588 4.256241 36.5974

# WAIC for m5, the model with the more concentrated priors; compare its
# `penalty` column against the value reported for m4.
# NOTE(review): refresh = 0 presumably silences progress output -- confirm.
WAIC(m5, refresh = 0)

##        WAIC     lppd penalty  std_err
## 1 -102.7166 55.61221 4.25393 36.42089

7-3. Consider three fictional Polynesian islands. On each there is a Royal Ornithologist charged by the king with surveying the bird population. They have each found the following proportions of 5 important bird species:

## [1] 1.626428

## [1] 0.9766913

## [1] 1.238381

Notice that each row sums to 1, accounting for all the birds. This problem has two parts. It is not computationally complicated. But it is conceptually tricky. First, compute the entropy of each island’s bird distribution. Interpret these entropy values. Second, use each island’s bird distribution to predict the other two. This means to compute the KL divergence of each island from the others, treating each island as if it were a statistical model of the other islands. You should end up with 6 different KL divergence values. Which island predicts the others best? Why?

# Kullback-Leibler divergence sum(p * (log(p) - log(q))) of the model
# distribution q from the target distribution p.
#
# p, q: numeric probability vectors of equal length.
# Returns a single number; Inf when q is 0 somewhere that p is positive.
#
# Entries with p == 0 contribute nothing (the usual 0 * log(0) = 0
# convention) instead of turning the whole sum into NaN.
DKL <- function(p, q) {
  # Guard against silent recycling of vectors with mismatched lengths.
  stopifnot(is.numeric(p), is.numeric(q), length(p) == length(q))
  support <- p > 0
  sum(p[support] * (log(p[support]) - log(q[support])))
}

# Pairwise KL divergences between the three islands:
# Dm[i, j] = DKL(island i, island j), so the diagonal compares each island
# with itself.
islands <- list(island1, island2, island3)
n_islands <- length(islands)
Dm <- matrix(NA, nrow = n_islands, ncol = n_islands)
for (i in seq_len(n_islands)) {
  for (j in seq_len(n_islands)) {
    Dm[i, j] <- DKL(islands[[i]], islands[[j]])
  }
}

Dm

##         [,1]     [,2]      [,3]
## [1,] 0.00000 0.981833 0.2847269
## [2,] 2.16909 0.000000 2.2350604
## [3,] 2.27460 3.079917 0.0000000

7-4. Recall the marriage, age, and happiness collider bias example from Chapter 6. Run models m6.9 and m6.10 again (page 178). Compare these two models using WAIC (or PSIS, they will produce identical results). Which model is expected to make better predictions? Which model provides the correct causal inference about the influence of age on happiness? Can you explain why the answers to these two questions disagree?

# Simulate the happiness data, keep rows with age above 17, rescale age so
# that 18 maps to 0 and 65 maps to 1, and build a 1-based marriage index.
d2 <- sim_happiness(seed = 1977, N_years = 1000)

is_adult <- d2$age > 17
dAdults <- d2[is_adult, ]
dAdults[["A"]] <- (dAdults[["age"]] - 18) / (65 - 18)

dAdults[["mid"]] <- dAdults[["married"]] + 1
# m6.9: happiness on rescaled age A, with a separate intercept a[mid] for
# each value of the marriage index.
m6.9_formula <- alist(
  happiness ~ dnorm(mu, sigma),
  mu <- a[mid] + bA * A,
  a[mid] ~ dnorm(0, 1),
  bA ~ dnorm(0, 2),
  sigma ~ dexp(1)
)
m6.9 <- quap(m6.9_formula, data = dAdults)

# m6.10: happiness on rescaled age A with a single shared intercept
# (marriage status is left out of the model).
m6.10_formula <- alist(
  happiness ~ dnorm(mu, sigma),
  mu <- a + bA * A,
  a ~ dnorm(0, 1),
  bA ~ dnorm(0, 2),
  sigma ~ dexp(1)
)
m6.10 <- quap(m6.10_formula, data = dAdults)
# Rank the two happiness models by WAIC; both are fit to dAdults, so the
# scores are directly comparable.
compare(m6.9, m6.10)

##           WAIC       SE    dWAIC      dSE    pWAIC       weight
## m6.9  2713.971 37.54465   0.0000       NA 3.738532 1.000000e+00
## m6.10 3101.906 27.74379 387.9347 35.40032 2.340445 5.768312e-85

7-5. Revisit the urban fox data, data(foxes), from the previous chapter’s practice problems. Use WAIC or PSIS based model comparison on five different models, each using weight as the outcome, and containing these sets of predictor variables:

avgfood + groupsize + area
avgfood + groupsize
groupsize + area
avgfood
area

Can you explain the relative differences in WAIC scores, using the fox DAG from the previous chapter? Be sure to pay attention to the standard error of the score differences (dSE).

data("foxes")

# Convert to a tibble and apply standardize() to every column except the
# group identifier.
foxes_q <- mutate(as_tibble(foxes), across(-group, standardize))

# m7: weight ~ avgfood + groupsize + area (all three predictors).
m7 <- quap(
  alist(
    weight ~ dnorm(mu, sigma),
    mu <- a + Bf * avgfood + Bg * groupsize + Ba * area,
    a ~ dnorm(0, 0.2),
    c(Bf, Bg, Ba) ~ dnorm(0, 0.5),
    sigma ~ dexp(1)
  ),
  data = foxes_q
)

# m8: weight ~ avgfood + groupsize.
m8 <- quap(
  alist(
    weight ~ dnorm(mu, sigma),
    mu <- a + Bf * avgfood + Bg * groupsize,
    a ~ dnorm(0, 0.2),
    c(Bf, Bg) ~ dnorm(0, 0.5),
    sigma ~ dexp(1)
  ),
  data = foxes_q
)

# m9: weight ~ groupsize + area.
m9 <- quap(
  alist(
    weight ~ dnorm(mu, sigma),
    mu <- a + Bg * groupsize + Ba * area,
    a ~ dnorm(0, 0.2),
    c(Bg, Ba) ~ dnorm(0, 0.5),
    sigma ~ dexp(1)
  ),
  data = foxes_q
)

# m10: weight ~ avgfood.
# The linear model includes the intercept `a`, matching its declared prior
# and the structure of the other four fox models, so the comparison differs
# only in the predictor set.
m10 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- a + Bf*avgfood, 
  a ~ dnorm(0, 0.2), 
  Bf ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = foxes_q)

# m11: weight ~ area.
m11 <- quap(
  alist(
    weight ~ dnorm(mu, sigma),
    mu <- a + Ba * area,
    a ~ dnorm(0, 0.2),
    Ba ~ dnorm(0, 0.5),
    sigma ~ dexp(1)
  ),
  data = foxes_q
)

# Rank the five fox models by WAIC; read dWAIC together with dSE before
# treating any difference between models as meaningful.
compare(m7, m8, m9, m10, m11)

##         WAIC       SE     dWAIC      dSE    pWAIC      weight
## m7  322.8847 16.27783  0.000000       NA 4.656959 0.463677939
## m9  323.8985 15.68240  1.013749 2.899417 3.718565 0.279308216
## m8  324.1284 16.13964  1.243666 3.598475 3.859897 0.248976214
## m10 331.5849 13.66936  8.700184 7.213498 1.495394 0.005984054
## m11 333.7239 13.79447 10.839215 7.242069 2.650636 0.002053577

Anly505_Assignment7_Saamrth

2022-08-09

Week 7 - Overfitting

Questions