nlss_2022

library(rethinking)
library(cmdstanr)

library(tidyverse)
library(tidybayes)
library(haven)

for( .file in list.files('nlss2022/data/stata format') ) {
  .name <- strsplit(.file, ".", fixed=TRUE)[[1]][1]
  .path <- paste0('nlss2022/data/stata format/', .file)
  .table <- read_dta(.path) |> as_factor()
  assign( .name , .table )
}

df <- poverty |>
  left_join(
    S05 |> filter(food_code == 701)
  ) |>
  mutate(
    potato_consumption = ((q05_03 + q05_04 + q05_05) / hhsize ) |> replace_na(0),
    potato_value = ((replace_na(q05_03_b, 0) + replace_na(q05_04_b, 0) + replace_na(q05_05_b, 0)) / hhsize),
    pcep_potato = log(potato_value / pcep_food),
    spending = log(pcep),
  ) |>
  filter(is.finite(pcep_potato) & is.finite(pcep)) |>
  mutate(
    P = standardize(pcep_potato),
    S = standardize(spending)
  )

## Joining with `by = join_by(psu_number, hh_number)`

# raw values
df |>
  mutate(
    S = pcep
  ) |>
  ggplot( aes(S, weight=hhs_wt) ) +
    geom_histogram(bins=20, color="gray")

# standardized values
df |>
  mutate(
    S = pcep |> standardize()
  ) |>
  ggplot( aes(S, weight=hhs_wt) ) +
    geom_histogram(bins=20, color="gray") +
    labs(x="PCEP (standardized)")

# logscale values
df |>
  mutate(
    S = pcep |> log()
  ) |>
  filter( is.finite(S) ) |>
  ggplot( aes(S, weight=hhs_wt) ) +
    geom_histogram(bins=20, color="gray") +
    labs(x="PCEP (logscale)")

df |>
  mutate(
    S = pcep |> log() |> standardize()
  ) |>
  filter( is.finite(S) ) |>
  ggplot( aes(S, weight=hhs_wt) ) +
    geom_histogram(bins=20, color="gray") +
    labs(x="PCEP (logscale and standardized)")

# standardized and logscale densities
df |>
  ggplot(mapping = aes(x = P)) +
    geom_density() +
    labs(
      x = "Potato consumption as a proportion of overall food consumption (standardized, logscale)"
    )

df |>
  ggplot(mapping = aes(x = S)) +
    geom_density() +
    labs(
      x = "Per Capital total spending (standardized, logscale)"
    )

df |>
  ggplot(mapping = aes(x = S, y = P)) +
    geom_point(shape=1, color="blue", alpha=0.2) +
    labs(x = "Total Spending", y="Potato Consumption")

model <- quap(
  alist(
    P ~ dnorm( mu , sigma ),
    mu <- a + b * S, 
    a ~ dnorm( 0, 2 ),
    b ~ dnorm( 0 , 2 ),
    sigma ~ dunif( 0 , 5 )
  ),
  data=df
)
precis(model, corr=TRUE, digits=4)

##                mean          sd        5.5%       94.5%
## a     -0.0000178637 0.009821625 -0.01571472  0.01567899
## b     -0.3843162628 0.009822181 -0.40001400 -0.36861852
## sigma  0.9231916433 0.006945525  0.91209135  0.93429193

# add some summary statistics
tibble(
  S = seq(from = -3, to = 6.5, by = 0.1)
) |>
  mutate(
    mu = map(S, \(x) link(model, data = list(S = x))),
    mu.avg = map_dbl(mu, mean),
    mu.min = map_dbl(mu, quantile, 0.025),
    mu.max = map_dbl(mu, quantile, 0.975),
    sim = map(S, \(x) sim(model, data = list(S = x))),
    sim.min = map_dbl(sim, quantile, 0.025),
    sim.max = map_dbl(sim, quantile, 0.975),
  ) |>
  ggplot(mapping = aes(x = S)) +
    geom_ribbon(aes(ymin=sim.min, ymax=sim.max), color="lightgray", alpha=0.2) +
    geom_ribbon(aes(ymin=mu.min, ymax=mu.max), color="gray", alpha=0.2) +
    geom_line(aes(y = mu.avg)) +
    geom_point(data=df, aes(y = P), shape=1, color="blue", alpha=0.2) +
    labs(x = "Total Spending", y="Potato Consumption")

.linked <- link(model)
.simmed <- sim(model)
df |>
  mutate(
    reg.avg = .linked |> apply(2, mean),
    reg.min = .linked |> apply(2, quantile, 0.025),
    reg.max = .linked |> apply(2, quantile, 0.975),
    sim.min = .simmed |> apply(2, quantile, 0.1),
    sim.max = .simmed |> apply(2, quantile, 0.9),
  ) |>
  ggplot(mapping = aes(x = S, y = P)) +
    geom_ribbon(aes(ymin=sim.min, ymax=sim.max), color="lightgray", alpha=0.2) +
    geom_ribbon(aes(ymin=reg.min, ymax=reg.max), color="gray", alpha=0.2) +
    geom_line(aes(y = reg.avg)) +
    geom_point(shape=1, color="blue", alpha=0.2) +
    labs(x = "Total Spending", y="Potato Consumption")