Assignment #7

Questions

7-1. When comparing models with an information criterion, why must all models be fit to exactly the same observations? What would happen to the information criterion values, if the models were fit to different numbers of observations? Perform some simulations.

library(rethinking)
library(ggplot2)
library(tidyverse)
#We know that information criteria is directly related to deviance which is sum, so more observations result in higher deviance or sum, this in turn makes model with less observations not so useful compared to one with higher observations.

  sim <- function(N){
  data <- tibble(x = rnorm(N), 
                y = rnorm(N)) %>% 
    mutate(across(everything(), standardize)) 
  
  alist(
    x ~ dnorm(mu, 1), 
    mu <- a + By*y,
    a ~ dnorm(0.1, 0.4), 
    By ~ dnorm(0.5, 0.9) 
  ) %>% 
    quap(data = data) %>% 
    WAIC() %>% 
    as_tibble() %>% 
    pull(WAIC)
  }
  
seq(300, 900, by = 100) %>%
  map_dbl(sim) %>% 
  enframe(name = "observations", value = "WAIC value") %>% 
  mutate(observations = seq(300, 900, by = 100))

## # A tibble: 7 x 2
##   observations `WAIC value`
##          <dbl>        <dbl>
## 1          300         854.
## 2          400        1138.
## 3          500        1422.
## 4          600        1706.
## 5          700        1989.
## 6          800        2273.
## 7          900        2557.

#We can see in the tibble as observations increse, WAIC value also increases

7-2. What happens to the effective number of parameters, as measured by PSIS or WAIC, as a prior becomes more concentrated? Why? Perform some simulations.

  sim1 <- function(prior){
  data <- tibble(x = rnorm(20), 
                y = rnorm(20)) %>% 
    mutate(across(everything(), standardize)) 
  
  suppressMessages(
    alist(
      x ~ dnorm(mu, 1), 
      mu <- a + By*y,
      a ~ dnorm(0, prior), 
      By ~ dnorm(0, prior)
    ) %>% 
      quap(data = list(x = data$x, y = data$y, prior = prior)) %>% 
      PSIS() %>% 
      as_tibble() %>% 
      pull(penalty)
  )
  }


seq(1, 0.1, length.out = 20) %>%
  purrr::map_dbl(sim1) %>% 
  enframe(name = "b_prior", value = "n_params") %>% 
  mutate(b_prior = seq(1, 0.1, length.out = 20))

## # A tibble: 20 x 2
##    b_prior n_params
##      <dbl>    <dbl>
##  1   1        2.13 
##  2   0.953    1.46 
##  3   0.905    2.05 
##  4   0.858    1.50 
##  5   0.811    1.41 
##  6   0.763    1.55 
##  7   0.716    1.61 
##  8   0.668    1.61 
##  9   0.621    1.68 
## 10   0.574    1.01 
## 11   0.526    1.71 
## 12   0.479    1.14 
## 13   0.432    1.43 
## 14   0.384    1.67 
## 15   0.337    2.51 
## 16   0.289    1.40 
## 17   0.242    1.09 
## 18   0.195    0.571
## 19   0.147    0.535
## 20   0.1      0.293

#We can conclude that when prior gets more concentrated then number of effective parameters decrease

7-3. Consider three fictional Polynesian islands. On each there is a Royal Ornithologist charged by the king with surveying the bird population. They have each found the following proportions of 5 important bird species:

Island	Species A	Species B	Species C	Species D	Species E
1	0.2	0.2	0.2	0.2	0.2
2	0.8	0.1	0.05	0.025	0.025
3	0.05	0.15	0.7	0.05	0.05

Notice that each row sums to 1, all the birds. This problem has two parts. It is not computationally complicated. But it is conceptually tricky. First, compute the entropy of each island’s bird distribution. Interpret these entropy values. Second, use each island’s bird distribution to predict the other two. This means to compute the KL divergence of each island from the others, treating each island as if it were a statistical model of the other islands. You should end up with 6 different KL divergence values. Which island predicts the others best? Why?

island %>% 
knitr::kable()

Island	Species A	Species B	Species C	Species D	Species E
Island 1	0.20	0.20	0.20	0.200	0.200
Island 2	0.80	0.10	0.05	0.025	0.025
Island 3	0.05	0.15	0.70	0.050	0.050

island_l <- island %>% 
  janitor::clean_names() %>% 
  pivot_longer(cols = -island, names_to = "species", values_to = "proportion") 


entropy <- function(p){
  -sum(p*log(p))
}
  
  
island_l %>% 
  group_by(island) %>% 
  summarise(entropy = entropy(proportion)) %>% 
  mutate(across(is.numeric, round, 2)) %>% 
  knitr::kable()

island	entropy
Island 1	1.61
Island 2	0.74
Island 3	0.98

kldivergence <- function(p,q) sum(p*(log(p)-log(q)))

island_l %>% 
  select(-species) %>% 
  group_by(island) %>% 
  nest() %>% 
  pivot_wider(names_from = island, values_from = data) %>% 
  janitor::clean_names() %>% 
  mutate(one_v_two = map2_dbl(island_1, island_2, kldivergence),
         two_v_one = map2_dbl(island_2, island_1, kldivergence),
         one_v_three = map2_dbl(island_1, island_3, kldivergence),
         three_v_one = map2_dbl(island_3, island_1, kldivergence),
         two_v_three = map2_dbl(island_2, island_3, kldivergence),
         three_v_two = map2_dbl(island_3, island_2, kldivergence)
         ) %>% 
  select(one_v_two:three_v_two) %>% 
  pivot_longer(cols = everything(), names_to = "Comparison", values_to = "KL Divergence") %>% 
  knitr::kable()

Comparison	KL Divergence
one_v_two	0.9704061
two_v_one	0.8664340
one_v_three	0.6387604
three_v_one	0.6258376
two_v_three	2.0109142
three_v_two	1.8388452

#From results it can be concluded that one predicts the best

7-4. Recall the marriage, age, and happiness collider bias example from Chapter 6. Run models m6.9 and m6.10 again (page 178). Compare these two models using WAIC (or PSIS, they will produce identical results). Which model is expected to make better predictions? Which model provides the correct causal inference about the influence of age on happiness? Can you explain why the answers to these two questions disagree?

data <- sim_happiness(seed=1977, N_years=1000)

data <- data %>% 
  as_tibble() %>% 
  filter(age > 17) %>% 
  mutate(age = (age - 18)/ (65 -18)) %>% 
  mutate(mid = married +1)

model6.9 <- alist(
  happiness ~ dnorm( mu , sigma ),
  mu <- a[mid] + bA*age,
  a[mid] ~ dnorm( 0 , 1 ),
  bA ~ dnorm( 0 , 2 ),
  sigma ~ dexp(1)) %>% 
  quap(data = data) 

model6.10 <- alist(
  happiness ~ dnorm( mu , sigma ),
  mu <- a + bA*age,
  a ~ dnorm( 0 , 1 ),
  bA ~ dnorm( 0 , 2 ),
  sigma ~ dexp(1)) %>% 
  quap(data = data)

c(model6.9, model6.10) %>% 
  map_dfr(WAIC) %>% 
  add_column(model = c("model6.9", "model6.10"), .before = 1) %>% 
  knitr::kable()

model	WAIC	lppd	penalty	std_err
model6.9	2713.971	-1353.247	3.738532	37.54465
model6.10	3101.906	-1548.612	2.340445	27.74379

#Model with higher WAIC score is Model 6.10 as it doesn't take into consideration the causal association. Predictions are better with non-causal association

7-5. Revisit the urban fox data, data(foxes), from the previous chapter’s practice problems. Use WAIC or PSIS based model comparison on five different models, each using weight as the outcome, and containing these sets of predictor variables:

avgfood + groupsize + area
avgfood + groupsize
groupsize + area
avgfood
area

Can you explain the relative differences in WAIC scores, using the fox DAG from the previous chapter? Be sure to pay attention to the standard error of the score differences (dSE).

data("foxes")

data_foxes <- foxes %>% 
  as_tibble() %>% 
  mutate(across(-group, standardize))


model1 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- a + Bf*avgfood + Bg*groupsize + Ba*area, 
  a ~ dnorm(0, 0.2), 
  c(Bf, Bg, Ba) ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = data_foxes)

model2 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- a + Bf*avgfood + Bg*groupsize, 
  a ~ dnorm(0, 0.2), 
  c(Bf, Bg) ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = data_foxes)

model3 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- a + Bg*groupsize + Ba*area, 
  a ~ dnorm(0, 0.2), 
  c(Bg, Ba) ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = data_foxes)

model4 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- Bf*avgfood, 
  a ~ dnorm(0, 0.2), 
  Bf ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = data_foxes)

model5 <- alist(
  weight ~ dnorm(mu, sigma), 
  mu <- a + Ba*area, 
  a ~ dnorm(0, 0.2), 
  Ba ~ dnorm(0, 0.5), 
  sigma ~ dexp(1)) %>% 
  quap(data = data_foxes)

compare(model1, model2, model3, model4, model5) %>% 
  as_tibble(rownames = "model") %>% 
  mutate(across(is.numeric, round, 2)) %>% 
  knitr::kable()

model	WAIC	SE	dWAIC	dSE	pWAIC	weight
model1	322.88	16.28	0.00	NA	4.66	0.46
model3	323.90	15.68	1.01	2.90	3.72	0.28
model2	324.13	16.14	1.24	3.60	3.86	0.25
model4	331.58	13.67	8.70	7.21	1.50	0.01
model5	333.72	13.79	10.84	7.24	2.65	0.00

#As seen from the WAIC values for the models, we notice that models 1-3 and 4-5 have similar values.

#Models 1-3 have groupsize path and 4-5 contain similar info. too (avgfood and area) thats why similar groupings for WAIC values for both groups

Assignment #7

Karan Sahai

2022-08-08

Week 7 - Overfitting

Questions